summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS12
-rw-r--r--arch/alpha/include/asm/pgtable.h2
-rw-r--r--arch/arc/include/asm/pgtable-bits-arcv2.h2
-rw-r--r--arch/arm/include/asm/pgtable-nommu.h2
-rw-r--r--arch/arm/include/asm/pgtable.h4
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/mm/mmu.c47
-rw-r--r--arch/arm64/mm/pageattr.c3
-rw-r--r--arch/csky/include/asm/pgtable.h3
-rw-r--r--arch/hexagon/include/asm/page.h7
-rw-r--r--arch/ia64/include/asm/pgtable.h16
-rw-r--r--arch/ia64/mm/hugetlbpage.c15
-rw-r--r--arch/loongarch/include/asm/pgtable.h2
-rw-r--r--arch/m68k/include/asm/pgtable_mm.h2
-rw-r--r--arch/m68k/include/asm/pgtable_no.h1
-rw-r--r--arch/microblaze/include/asm/pgtable.h3
-rw-r--r--arch/mips/include/asm/pgtable.h2
-rw-r--r--arch/nios2/include/asm/pgtable.h2
-rw-r--r--arch/nios2/include/asm/processor.h3
-rw-r--r--arch/openrisc/include/asm/pgtable.h2
-rw-r--r--arch/parisc/include/asm/pgtable.h15
-rw-r--r--arch/parisc/kernel/pdt.c5
-rw-r--r--arch/powerpc/include/asm/pgtable.h7
-rw-r--r--arch/powerpc/mm/hugetlbpage.c37
-rw-r--r--arch/riscv/include/asm/pgtable.h2
-rw-r--r--arch/s390/include/asm/pgtable.h2
-rw-r--r--arch/sh/include/asm/pgtable.h2
-rw-r--r--arch/sparc/include/asm/pgtable_32.h6
-rw-r--r--arch/sparc/mm/init_32.c3
-rw-r--r--arch/sparc/mm/init_64.c1
-rw-r--r--arch/um/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_32.h9
-rw-r--r--arch/x86/include/asm/pgtable_64.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c4
-rw-r--r--arch/x86/mm/init_64.c41
-rw-r--r--arch/xtensa/include/asm/pgtable.h2
-rw-r--r--drivers/acpi/numa/hmat.c7
-rw-r--r--drivers/base/memory.c38
-rw-r--r--drivers/block/zram/zram_drv.c13
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c2
-rw-r--r--fs/hugetlbfs/inode.c26
-rw-r--r--fs/proc/kcore.c33
-rw-r--r--include/linux/compiler-gcc.h21
-rw-r--r--include/linux/hugetlb.h113
-rw-r--r--include/linux/memory-tiers.h1
-rw-r--r--include/linux/memory.h18
-rw-r--r--include/linux/mm.h32
-rw-r--r--include/linux/mm_types.h14
-rw-r--r--include/linux/pagemap.h6
-rw-r--r--include/linux/swap.h5
-rw-r--r--include/linux/swapops.h24
-rw-r--r--include/trace/events/vmalloc.h123
-rw-r--r--kernel/cgroup/cpuset.c7
-rw-r--r--kernel/sysctl.c1
-rw-r--r--lib/Kconfig.kasan2
-rw-r--r--mm/debug_vm_pgtable.c8
-rw-r--r--mm/filemap.c28
-rw-r--r--mm/folio-compat.c7
-rw-r--r--mm/gup.c80
-rw-r--r--mm/gup_test.c141
-rw-r--r--mm/gup_test.h12
-rw-r--r--mm/huge_memory.c22
-rw-r--r--mm/hugetlb.c199
-rw-r--r--mm/internal.h12
-rw-r--r--mm/kasan/kasan.h8
-rw-r--r--mm/kasan/kasan_test.c148
-rw-r--r--mm/kasan/kasan_test_module.c60
-rw-r--r--mm/kasan/report.c31
-rw-r--r--mm/kasan/shadow.c2
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/memcontrol.c32
-rw-r--r--mm/memory-failure.c156
-rw-r--r--mm/memory-tiers.c2
-rw-r--r--mm/memory.c2
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mincore.c14
-rw-r--r--mm/mm_init.c8
-rw-r--r--mm/mmap.c6
-rw-r--r--mm/mprotect.c5
-rw-r--r--mm/page_ext.c2
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/shmem.c23
-rw-r--r--mm/slub.c7
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c25
-rw-r--r--mm/swap.h8
-rw-r--r--mm/swap_state.c28
-rw-r--r--mm/truncate.c30
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c4
-rw-r--r--mm/workingset.c2
-rw-r--r--tools/testing/selftests/vm/.gitignore1
-rw-r--r--tools/testing/selftests/vm/Makefile25
-rw-r--r--tools/testing/selftests/vm/anon_cow.c1126
-rw-r--r--tools/testing/selftests/vm/check_config.sh31
-rw-r--r--tools/testing/selftests/vm/hugepage-mremap.c21
-rw-r--r--tools/testing/selftests/vm/hugetlb-madvise.c12
-rw-r--r--tools/testing/selftests/vm/madv_populate.c8
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests.sh21
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c62
-rw-r--r--tools/testing/selftests/vm/vm_util.c15
-rw-r--r--tools/testing/selftests/vm/vm_util.h2
102 files changed, 2184 insertions, 1068 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 0e152a7afda8..7b6b8034811c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13294,10 +13294,20 @@ F: include/linux/memory_hotplug.h
F: include/linux/mm.h
F: include/linux/mmzone.h
F: include/linux/pagewalk.h
-F: include/linux/vmalloc.h
F: mm/
F: tools/testing/selftests/vm/
+VMALLOC
+M: Andrew Morton <akpm@linux-foundation.org>
+R: Uladzislau Rezki <urezki@gmail.com>
+R: Christoph Hellwig <hch@infradead.org>
+L: linux-mm@kvack.org
+S: Maintained
+W: http://www.linux-mm.org
+T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: include/linux/vmalloc.h
+F: mm/vmalloc.c
+
MEMORY HOT(UN)PLUG
M: David Hildenbrand <david@redhat.com>
M: Oscar Salvador <osalvador@suse.de>
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 3ea9661c09ff..9e45f6735d5d 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -313,8 +313,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
#define pte_ERROR(e) \
printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
#define pmd_ERROR(e) \
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index b23be557403e..515e82db519f 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -120,8 +120,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#include <asm/hugepage.h>
#endif
diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h
index d16aba48fa0a..25d8c7bb07e0 100644
--- a/arch/arm/include/asm/pgtable-nommu.h
+++ b/arch/arm/include/asm/pgtable-nommu.h
@@ -21,8 +21,6 @@
#define pgd_none(pgd) (0)
#define pgd_bad(pgd) (0)
#define pgd_clear(pgdp)
-#define kern_addr_valid(addr) (1)
-/* FIXME */
/*
* PMD_SHIFT determines the size of the area a second-level page table can map
* PGDIR_SHIFT determines what a third-level page table entry can map
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 78a532068fec..00954ab1a039 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -298,10 +298,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
*/
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-/* FIXME: this is not correct */
-#define kern_addr_valid(addr) (1)
-
/*
* We provide our own arch_get_unmapped_area to cope with VIPT caches.
*/
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 71a1af42f0e8..4873c1d6e7d0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1021,8 +1021,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
*/
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
-extern int kern_addr_valid(unsigned long addr);
-
#ifdef CONFIG_ARM64_MTE
#define __HAVE_ARCH_PREPARE_TO_SWAP
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 9a7c38965154..556154d821bf 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -814,53 +814,6 @@ void __init paging_init(void)
create_idmap();
}
-/*
- * Check whether a kernel address is valid (derived from arch/x86/).
- */
-int kern_addr_valid(unsigned long addr)
-{
- pgd_t *pgdp;
- p4d_t *p4dp;
- pud_t *pudp, pud;
- pmd_t *pmdp, pmd;
- pte_t *ptep, pte;
-
- addr = arch_kasan_reset_tag(addr);
- if ((((long)addr) >> VA_BITS) != -1UL)
- return 0;
-
- pgdp = pgd_offset_k(addr);
- if (pgd_none(READ_ONCE(*pgdp)))
- return 0;
-
- p4dp = p4d_offset(pgdp, addr);
- if (p4d_none(READ_ONCE(*p4dp)))
- return 0;
-
- pudp = pud_offset(p4dp, addr);
- pud = READ_ONCE(*pudp);
- if (pud_none(pud))
- return 0;
-
- if (pud_sect(pud))
- return pfn_valid(pud_pfn(pud));
-
- pmdp = pmd_offset(pudp, addr);
- pmd = READ_ONCE(*pmdp);
- if (pmd_none(pmd))
- return 0;
-
- if (pmd_sect(pmd))
- return pfn_valid(pmd_pfn(pmd));
-
- ptep = pte_offset_kernel(pmdp, addr);
- pte = READ_ONCE(*ptep);
- if (pte_none(pte))
- return 0;
-
- return pfn_valid(pte_pfn(pte));
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_hotplug_page_range(struct page *page, size_t size,
struct vmem_altmap *altmap)
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index d107c3d434e2..0a741a910a6a 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -201,8 +201,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
/*
* This function is used to determine if a linear map page has been marked as
- * not-valid. Walk the page table and check the PTE_VALID bit. This is based
- * on kern_addr_valid(), which almost does what we need.
+ * not-valid. Walk the page table and check the PTE_VALID bit.
*
* Because this is only called on the kernel linear map, p?d_sect() implies
* p?d_present(). When debug_pagealloc is enabled, sections mappings are
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index c3d9b92cbe61..77bc6caff2d2 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -249,9 +249,6 @@ extern void paging_init(void);
void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *pte);
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-#define kern_addr_valid(addr) (1)
-
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
remap_pfn_range(vma, vaddr, pfn, size, prot)
diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h
index 7cbf719c578e..d7d4f9fca327 100644
--- a/arch/hexagon/include/asm/page.h
+++ b/arch/hexagon/include/asm/page.h
@@ -131,13 +131,6 @@ static inline void clear_page(void *page)
#define page_to_virt(page) __va(page_to_phys(page))
-/*
- * For port to Hexagon Virtual Machine, MAYBE we check for attempts
- * to reference reserved HVM space, but in any case, the VM will be
- * protected.
- */
-#define kern_addr_valid(addr) (1)
-
#include <asm/mem-layout.h>
#include <asm-generic/memory_model.h>
/* XXX Todo: implement assembly-optimized version of getorder. */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 6925e28ae61d..01517a5e6778 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -182,22 +182,6 @@ ia64_phys_addr_valid (unsigned long addr)
}
/*
- * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
- * memory. For the return value to be meaningful, ADDR must be >=
- * PAGE_OFFSET. This operation can be relatively expensive (e.g.,
- * require a hash-, or multi-level tree-lookup or something of that
- * sort) but it guarantees to return TRUE only if accessing the page
- * at that address does not cause an error. Note that there may be
- * addresses for which kern_addr_valid() returns FALSE even though an
- * access would not cause an error (e.g., this is typically true for
- * memory mapped I/O regions.
- *
- * XXX Need to implement this for IA-64.
- */
-#define kern_addr_valid(addr) (1)
-
-
-/*
* Now come the defines and routines to manage and access the three-level
* page table.
*/
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f993cb36c062..380d2f3966c9 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file,
return 0;
}
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
-{
- struct page *page;
- pte_t *ptep;
-
- if (REGION_NUMBER(addr) != RGN_HPAGE)
- return ERR_PTR(-EINVAL);
-
- ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
- if (!ptep || pte_none(*ptep))
- return NULL;
- page = pte_page(*ptep);
- page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
- return page;
-}
int pmd_huge(pmd_t pmd)
{
return 0;
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 10e0bd9009e2..b8d837ee6910 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -421,8 +421,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
__update_tlb(vma, address, (pte_t *)pmdp);
}
-#define kern_addr_valid(addr) (1)
-
static inline unsigned long pmd_pfn(pmd_t pmd)
{
return (pmd_val(pmd) & _PFN_MASK) >> _PFN_SHIFT;
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index 9b4e2fe2ac82..b93c41fe2067 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -145,8 +145,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
#endif /* !__ASSEMBLY__ */
-#define kern_addr_valid(addr) (1)
-
/* MMU-specific headers */
#ifdef CONFIG_SUN3
diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h
index bce5ca56c388..fed58da3a6b6 100644
--- a/arch/m68k/include/asm/pgtable_no.h
+++ b/arch/m68k/include/asm/pgtable_no.h
@@ -20,7 +20,6 @@
#define pgd_none(pgd) (0)
#define pgd_bad(pgd) (0)
#define pgd_clear(pgdp)
-#define kern_addr_valid(addr) (1)
#define pmd_offset(a, b) ((void *)0)
#define PAGE_NONE __pgprot(0)
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index ba348e997dbb..42f5988e998b 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -416,9 +416,6 @@ extern unsigned long iopa(unsigned long addr);
#define IOMAP_NOCACHE_NONSER 2
#define IOMAP_NO_COPYBACK 3
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-#define kern_addr_valid(addr) (1)
-
void do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code);
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 4678627673df..a68c0b01d8cd 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -550,8 +550,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
__update_tlb(vma, address, pte);
}
-#define kern_addr_valid(addr) (1)
-
/*
* Allow physical addresses to be fixed up to help 36-bit peripherals.
*/
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index b3d45e815295..ab793bc517f5 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -249,8 +249,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
-#define kern_addr_valid(addr) (1)
-
extern void __init paging_init(void);
extern void __init mmu_init(void);
diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h
index 8916d93d5c2d..eb44130364a9 100644
--- a/arch/nios2/include/asm/processor.h
+++ b/arch/nios2/include/asm/processor.h
@@ -50,9 +50,6 @@ struct thread_struct {
unsigned long kpsr;
};
-#define INIT_MMAP \
- { &init_mm, (0), (0), __pgprot(0x0), VM_READ | VM_WRITE | VM_EXEC }
-
# define INIT_THREAD { \
.kregs = NULL, \
.ksp = 0, \
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index dcae8aea132f..6477c17b3062 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -395,8 +395,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
typedef pte_t *pte_addr_t;
#endif /* __ASSEMBLY__ */
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index ecd028854469..bd09a44cfb2d 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -23,21 +23,6 @@
#include <asm/processor.h>
#include <asm/cache.h>
-/*
- * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
- * memory. For the return value to be meaningful, ADDR must be >=
- * PAGE_OFFSET. This operation can be relatively expensive (e.g.,
- * require a hash-, or multi-level tree-lookup or something of that
- * sort) but it guarantees to return TRUE only if accessing the page
- * at that address does not cause an error. Note that there may be
- * addresses for which kern_addr_valid() returns FALSE even though an
- * access would not cause an error (e.g., this is typically true for
- * memory mapped I/O regions.
- *
- * XXX Need to implement this for parisc.
- */
-#define kern_addr_valid(addr) (1)
-
/* This is for the serialization of PxTLB broadcasts. At least on the N class
* systems, only one PxTLB inter processor broadcast can be active at any one
* time on the Merced bus. */
diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c
index e391b175f5ec..80943a00e245 100644
--- a/arch/parisc/kernel/pdt.c
+++ b/arch/parisc/kernel/pdt.c
@@ -18,8 +18,7 @@
#include <linux/kthread.h>
#include <linux/initrd.h>
#include <linux/pgtable.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/mm.h>
#include <asm/pdc.h>
#include <asm/pdcpat.h>
@@ -232,7 +231,7 @@ void __init pdc_pdt_init(void)
/* mark memory page bad */
memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(addr >> PAGE_SHIFT);
}
}
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 283f40d05a4d..9972626ddaf6 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -81,13 +81,6 @@ void poking_init(void);
extern unsigned long ioremap_bot;
extern const pgprot_t protection_map[16];
-/*
- * kern_addr_valid is intended to indicate whether an address is a valid
- * kernel address. Most 32-bit archs define it as always true (like this)
- * but most 64-bit archs actually perform a test. What should we do here?
- */
-#define kern_addr_valid(addr) (1)
-
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_large(pmd) 0
#endif
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 5852a86d990d..f1ba8d1e8c1a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -506,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
} while (addr = next, addr != end);
}
-struct page *follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd,
- int flags, int pdshift)
-{
- pte_t *ptep;
- spinlock_t *ptl;
- struct page *page = NULL;
- unsigned long mask;
- int shift = hugepd_shift(hpd);
- struct mm_struct *mm = vma->vm_mm;
-
-retry:
- /*
- * hugepage directory entries are protected by mm->page_table_lock
- * Use this instead of huge_pte_lockptr
- */
- ptl = &mm->page_table_lock;
- spin_lock(ptl);
-
- ptep = hugepte_offset(hpd, address, pdshift);
- if (pte_present(*ptep)) {
- mask = (1UL << shift) - 1;
- page = pte_page(*ptep);
- page += ((address & mask) >> PAGE_SHIFT);
- if (flags & FOLL_GET)
- get_page(page);
- } else {
- if (is_hugetlb_entry_migration(*ptep)) {
- spin_unlock(ptl);
- __migration_entry_wait(mm, ptep, ptl);
- goto retry;
- }
- }
- spin_unlock(ptl);
- return page;
-}
-
bool __init arch_hugetlb_valid_size(unsigned long size)
{
int shift = __ffs(size);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 92ec2d9d7273..7ee3ac315c7c 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -802,8 +802,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
#endif /* !CONFIG_MMU */
-#define kern_addr_valid(addr) (1) /* FIXME */
-
extern char _start[];
extern void *_dtb_early_va;
extern uintptr_t _dtb_early_pa;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 11e901286414..b26cbf1c533c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1774,8 +1774,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
extern int vmem_add_mapping(unsigned long start, unsigned long size);
extern void vmem_remove_mapping(unsigned long start, unsigned long size);
extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc);
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 6fb9ec54cf9b..3ce30becf6df 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -92,8 +92,6 @@ static inline unsigned long phys_addr_mask(void)
typedef pte_t *pte_addr_t;
-#define kern_addr_valid(addr) (1)
-
#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
struct vm_area_struct;
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 8ff549004fac..5acc05b572e6 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -368,12 +368,6 @@ __get_iospace (unsigned long addr)
}
}
-extern unsigned long *sparc_valid_addr_bitmap;
-
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-#define kern_addr_valid(addr) \
- (test_bit(__pa((unsigned long)(addr))>>20, sparc_valid_addr_bitmap))
-
/*
* For sparc32&64, the pfn in io_remap_pfn_range() carries <iospace> in
* its high 4 bits. These macros/functions put it there or get it from there.
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index d88e774c8eb4..9c0ea457bdf0 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -37,8 +37,7 @@
#include "mm_32.h"
-unsigned long *sparc_valid_addr_bitmap;
-EXPORT_SYMBOL(sparc_valid_addr_bitmap);
+static unsigned long *sparc_valid_addr_bitmap;
unsigned long phys_base;
EXPORT_SYMBOL(phys_base);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index d6faee23c77d..04f9db0c3111 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1667,7 +1667,6 @@ bool kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(*pte));
}
-EXPORT_SYMBOL(kern_addr_valid);
static unsigned long __ref kernel_map_hugepud(unsigned long vstart,
unsigned long vend,
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index 66bc3f99d9be..4e3052f2671a 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -298,8 +298,6 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
((swp_entry_t) { pte_val(pte_mkuptodate(pte)) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
/* Clear a kernel PTE and flush it from the TLB */
#define kpte_clear_flush(ptep, vaddr) \
do { \
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 7c9c968a42ef..7d4ad8907297 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -48,15 +48,6 @@ do { \
#endif /* !__ASSEMBLY__ */
/*
- * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM
- */
-#ifdef CONFIG_FLATMEM
-#define kern_addr_valid(addr) (1)
-#else
-#define kern_addr_valid(kaddr) (0)
-#endif
-
-/*
* This is used to calculate the .brk reservation for initial pagetables.
* Enough space is reserved to allocate pagetables sufficient to cover all
* of LOWMEM_PAGES, which is an upper bound on the size of the direct map of
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e479491da8d5..7929327abe00 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd)
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
-extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 1ec20807de1e..6225c525372d 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -268,7 +268,7 @@ static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
unsigned long addr,
unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *entry;
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
@@ -502,7 +502,7 @@ static void sgx_vma_open(struct vm_area_struct *vma)
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
unsigned long end, unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *page;
unsigned long count = 0;
int ret = 0;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3f040c6e5d13..e8db4edd7cc9 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1416,47 +1416,6 @@ void mark_rodata_ro(void)
debug_checkwx();
}
-int kern_addr_valid(unsigned long addr)
-{
- unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- if (above != 0 && above != -1UL)
- return 0;
-
- pgd = pgd_offset_k(addr);
- if (pgd_none(*pgd))
- return 0;
-
- p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
- return 0;
-
- pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
- return 0;
-
- if (pud_large(*pud))
- return pfn_valid(pud_pfn(*pud));
-
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- return 0;
-
- if (pmd_large(*pmd))
- return pfn_valid(pmd_pfn(*pmd));
-
- pte = pte_offset_kernel(pmd, addr);
- if (pte_none(*pte))
- return 0;
-
- return pfn_valid(pte_pfn(*pte));
-}
-
/*
* Block size is the minimum amount of memory which can be hotplugged or
* hotremoved. It must be power of two and must be equal or larger than
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index 54f577c13afa..5b5484d707b2 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -386,8 +386,6 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
#else
-#define kern_addr_valid(addr) (1)
-
extern void update_mmu_cache(struct vm_area_struct * vma,
unsigned long address, pte_t *ptep);
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 23f49a2f4d14..139e3b41653e 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -767,11 +767,6 @@ static int hmat_callback(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block hmat_callback_nb = {
- .notifier_call = hmat_callback,
- .priority = 2,
-};
-
static __init void hmat_free_structures(void)
{
struct memory_target *target, *tnext;
@@ -854,7 +849,7 @@ static __init int hmat_init(void)
hmat_register_targets();
/* Keep the table and structures if the notifier may use them */
- if (!register_hotmemory_notifier(&hmat_callback_nb))
+ if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI))
return 0;
out_put:
hmat_free_structures();
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 9aa0da991cfb..fe98fb8d94e5 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -175,6 +175,15 @@ int memory_notify(unsigned long val, void *v)
return blocking_notifier_call_chain(&memory_chain, val, v);
}
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+static unsigned long memblk_nr_poison(struct memory_block *mem);
+#else
+static inline unsigned long memblk_nr_poison(struct memory_block *mem)
+{
+ return 0;
+}
+#endif
+
static int memory_block_online(struct memory_block *mem)
{
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
@@ -183,6 +192,9 @@ static int memory_block_online(struct memory_block *mem)
struct zone *zone;
int ret;
+ if (memblk_nr_poison(mem))
+ return -EHWPOISON;
+
zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
start_pfn, nr_pages);
@@ -864,6 +876,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
+ num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
unregister_memory_block_under_nodes(mem);
remove_memory_block(mem);
}
@@ -1164,3 +1177,28 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
}
return ret;
}
+
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+void memblk_nr_poison_inc(unsigned long pfn)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ atomic_long_inc(&mem->nr_hwpoison);
+}
+
+void memblk_nr_poison_sub(unsigned long pfn, long i)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ atomic_long_sub(i, &mem->nr_hwpoison);
+}
+
+static unsigned long memblk_nr_poison(struct memory_block *mem)
+{
+ return atomic_long_read(&mem->nr_hwpoison);
+}
+#endif
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 966aab902d19..87711ddf4b54 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -188,16 +188,13 @@ static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
static inline void update_used_max(struct zram *zram,
const unsigned long pages)
{
- unsigned long old_max, cur_max;
-
- old_max = atomic_long_read(&zram->stats.max_used_pages);
+ unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages);
do {
- cur_max = old_max;
- if (pages > cur_max)
- old_max = atomic_long_cmpxchg(
- &zram->stats.max_used_pages, cur_max, pages);
- } while (old_max != cur_max);
+ if (cur_max >= pages)
+ return;
+ } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages,
+ &cur_max, pages));
}
static inline void zram_fill_page(void *ptr, unsigned long len,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 8ef31d687ef3..4728be161828 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -255,7 +255,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str
* becoming writable and makes is_cow_mapping(vm_flags) false.
*/
if (is_cow_mapping(vma->vm_flags) &&
- !(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+ !(vma->vm_flags & VM_ACCESS_FLAGS))
vma->vm_flags &= ~VM_MAYWRITE;
return drm_gem_ttm_mmap(obj, vma);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index df7772335dc0..3ee84604e36d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -370,11 +370,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void hugetlb_delete_from_page_cache(struct page *page)
+static void hugetlb_delete_from_page_cache(struct folio *folio)
{
- ClearPageDirty(page);
- ClearPageUptodate(page);
- delete_from_page_cache(page);
+ folio_clear_dirty(folio);
+ folio_clear_uptodate(folio);
+ filemap_remove_folio(folio);
}
/*
@@ -580,8 +580,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
* map could fail. Correspondingly, the subpool and global
* reserve usage count can need to be adjusted.
*/
- VM_BUG_ON(HPageRestoreReserve(&folio->page));
- hugetlb_delete_from_page_cache(&folio->page);
+ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
+ hugetlb_delete_from_page_cache(folio);
ret = true;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode, index,
@@ -1097,10 +1097,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- if (hugetlb_page_subpool(&src->page)) {
- hugetlb_set_page_subpool(&dst->page,
- hugetlb_page_subpool(&src->page));
- hugetlb_set_page_subpool(&src->page, NULL);
+ if (hugetlb_folio_subpool(src)) {
+ hugetlb_set_folio_subpool(dst,
+ hugetlb_folio_subpool(src));
+ hugetlb_set_folio_subpool(src, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1377,7 +1377,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->max_size_opt = memparse(param->string, &rest);
ctx->max_val_type = SIZE_STD;
@@ -1387,7 +1387,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_nr_inodes:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->nr_inodes = memparse(param->string, &rest);
return 0;
@@ -1403,7 +1403,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_min_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->min_size_opt = memparse(param->string, &rest);
ctx->min_val_type = SIZE_STD;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index dff921f7ca33..71157ee35c1a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -18,7 +18,6 @@
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
@@ -541,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
fallthrough;
case KCORE_VMEMMAP:
case KCORE_TEXT:
- if (kern_addr_valid(start)) {
- /*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
- */
- if (copy_from_kernel_nofault(buf, (void *)start,
- tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ /*
+ * Using bounce buffer to bypass the
+ * hardened user copy kernel text checks.
+ */
+ if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
}
} else {
- if (clear_user(buffer, tsz)) {
+ if (copy_to_user(buffer, buf, tsz)) {
ret = -EFAULT;
goto out;
}
@@ -638,10 +629,6 @@ static int __meminit kcore_callback(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block kcore_callback_nb __meminitdata = {
- .notifier_call = kcore_callback,
- .priority = 0,
-};
static struct kcore_list kcore_vmalloc;
@@ -694,7 +681,7 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- register_hotmemory_notifier(&kcore_callback_nb);
+ hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI);
return 0;
}
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index f55a37efdb97..7af9e34ec261 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -82,26 +82,21 @@
#define __noscs __attribute__((__no_sanitize__("shadow-call-stack")))
#endif
-#if __has_attribute(__no_sanitize_address__)
-#define __no_sanitize_address __attribute__((no_sanitize_address))
-#else
-#define __no_sanitize_address
-#endif
+#define __no_sanitize_address __attribute__((__no_sanitize_address__))
-#if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__)
-#define __no_sanitize_thread __attribute__((no_sanitize_thread))
+#if defined(__SANITIZE_THREAD__)
+#define __no_sanitize_thread __attribute__((__no_sanitize_thread__))
#else
#define __no_sanitize_thread
#endif
-#if __has_attribute(__no_sanitize_undefined__)
-#define __no_sanitize_undefined __attribute__((no_sanitize_undefined))
-#else
-#define __no_sanitize_undefined
-#endif
+#define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__))
+/*
+ * Only supported since gcc >= 12
+ */
#if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__)
-#define __no_sanitize_coverage __attribute__((no_sanitize_coverage))
+#define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__))
#else
#define __no_sanitize_coverage
#endif
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8b4f93e84868..65ea34022aa2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -149,6 +149,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long len);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct vm_area_struct *, struct vm_area_struct *);
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
struct page **, struct vm_area_struct **,
unsigned long *, unsigned long *, long, unsigned int,
@@ -181,8 +183,9 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
int isolate_hugetlb(struct page *page, struct list_head *list);
-int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison);
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared);
void putback_active_hugepage(struct page *page);
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
@@ -209,17 +212,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
- int write);
-struct page *follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd,
- int flags, int pdshift);
-struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
- int flags);
-struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags);
-struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
- pgd_t *pgd, int flags);
void hugetlb_vma_lock_read(struct vm_area_struct *vma);
void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
@@ -272,6 +264,12 @@ static inline void adjust_range_if_pmd_sharing_possible(
{
}
+static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
+}
+
static inline long follow_hugetlb_page(struct mm_struct *mm,
struct vm_area_struct *vma, struct page **pages,
struct vm_area_struct **vmas, unsigned long *position,
@@ -282,12 +280,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm,
return 0;
}
-static inline struct page *follow_huge_addr(struct mm_struct *mm,
- unsigned long address, int write)
-{
- return ERR_PTR(-EINVAL);
-}
-
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
struct mm_struct *src,
struct vm_area_struct *dst_vma,
@@ -320,31 +312,6 @@ static inline void hugetlb_show_meminfo_node(int nid)
{
}
-static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd, int flags,
- int pdshift)
-{
- return NULL;
-}
-
-static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma,
- unsigned long address, int flags)
-{
- return NULL;
-}
-
-static inline struct page *follow_huge_pud(struct mm_struct *mm,
- unsigned long address, pud_t *pud, int flags)
-{
- return NULL;
-}
-
-static inline struct page *follow_huge_pgd(struct mm_struct *mm,
- unsigned long address, pgd_t *pgd, int flags)
-{
- return NULL;
-}
-
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
{
@@ -425,12 +392,13 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list)
return -EBUSY;
}
-static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
{
return 0;
}
-static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
return 0;
}
@@ -623,26 +591,50 @@ enum hugetlb_page_flags {
*/
#ifdef CONFIG_HUGETLB_PAGE
#define TESTHPAGEFLAG(uname, flname) \
+static __always_inline \
+bool folio_test_hugetlb_##flname(struct folio *folio) \
+ { void *private = &folio->private; \
+ return test_bit(HPG_##flname, private); \
+ } \
static inline int HPage##uname(struct page *page) \
{ return test_bit(HPG_##flname, &(page->private)); }
#define SETHPAGEFLAG(uname, flname) \
+static __always_inline \
+void folio_set_hugetlb_##flname(struct folio *folio) \
+ { void *private = &folio->private; \
+ set_bit(HPG_##flname, private); \
+ } \
static inline void SetHPage##uname(struct page *page) \
{ set_bit(HPG_##flname, &(page->private)); }
#define CLEARHPAGEFLAG(uname, flname) \
+static __always_inline \
+void folio_clear_hugetlb_##flname(struct folio *folio) \
+ { void *private = &folio->private; \
+ clear_bit(HPG_##flname, private); \
+ } \
static inline void ClearHPage##uname(struct page *page) \
{ clear_bit(HPG_##flname, &(page->private)); }
#else
#define TESTHPAGEFLAG(uname, flname) \
+static inline bool \
+folio_test_hugetlb_##flname(struct folio *folio) \
+ { return 0; } \
static inline int HPage##uname(struct page *page) \
{ return 0; }
#define SETHPAGEFLAG(uname, flname) \
+static inline void \
+folio_set_hugetlb_##flname(struct folio *folio) \
+ { } \
static inline void SetHPage##uname(struct page *page) \
{ }
#define CLEARHPAGEFLAG(uname, flname) \
+static inline void \
+folio_clear_hugetlb_##flname(struct folio *folio) \
+ { } \
static inline void ClearHPage##uname(struct page *page) \
{ }
#endif
@@ -728,18 +720,29 @@ extern unsigned int default_hstate_idx;
#define default_hstate (hstates[default_hstate_idx])
+static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
+{
+ return (void *)folio_get_private_1(folio);
+}
+
/*
* hugetlb page subpool pointer located in hpage[1].private
*/
static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
{
- return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL);
+ return hugetlb_folio_subpool(page_folio(hpage));
+}
+
+static inline void hugetlb_set_folio_subpool(struct folio *folio,
+ struct hugepage_subpool *subpool)
+{
+ folio_set_private_1(folio, (unsigned long)subpool);
}
static inline void hugetlb_set_page_subpool(struct page *hpage,
struct hugepage_subpool *subpool)
{
- set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool);
+ hugetlb_set_folio_subpool(page_folio(hpage), subpool);
}
static inline struct hstate *hstate_file(struct file *f)
@@ -823,10 +826,15 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
}
#endif
+static inline struct hstate *folio_hstate(struct folio *folio)
+{
+ VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+ return size_to_hstate(folio_size(folio));
+}
+
static inline struct hstate *page_hstate(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- return size_to_hstate(page_size(page));
+ return folio_hstate(page_folio(page));
}
static inline unsigned hstate_index_to_shift(unsigned index)
@@ -1035,6 +1043,11 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
return NULL;
}
+static inline struct hstate *folio_hstate(struct folio *folio)
+{
+ return NULL;
+}
+
static inline struct hstate *page_hstate(struct page *page)
{
return NULL;
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 965009aa01d7..fc9647b1b4f9 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -18,7 +18,6 @@
* the same memory tier.
*/
#define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
-#define MEMTIER_HOTPLUG_PRIO 100
struct memory_tier;
struct memory_dev_type {
diff --git a/include/linux/memory.h b/include/linux/memory.h
index aa619464a1df..31343566c221 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -19,7 +19,6 @@
#include <linux/node.h>
#include <linux/compiler.h>
#include <linux/mutex.h>
-#include <linux/notifier.h>
#define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS)
@@ -85,6 +84,9 @@ struct memory_block {
unsigned long nr_vmemmap_pages;
struct memory_group *group; /* group (if any) for this block */
struct list_head group_next; /* next block inside memory group */
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+ atomic_long_t nr_hwpoison;
+#endif
};
int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -113,8 +115,13 @@ struct mem_section;
* Priorities for the hotplug memory callback routines (stored in decreasing
* order in the callback chain)
*/
-#define SLAB_CALLBACK_PRI 1
-#define IPC_CALLBACK_PRI 10
+#define DEFAULT_CALLBACK_PRI 0
+#define SLAB_CALLBACK_PRI 1
+#define HMAT_CALLBACK_PRI 2
+#define MM_COMPUTE_BATCH_PRI 10
+#define CPUSET_CALLBACK_PRI 10
+#define MEMTIER_HOTPLUG_PRI 100
+#define KSM_CALLBACK_PRI 100
#ifndef CONFIG_MEMORY_HOTPLUG
static inline void memory_dev_init(void)
@@ -136,9 +143,6 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
{
return 0;
}
-/* These aren't inline functions due to a GCC bug. */
-#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; })
-#define unregister_hotmemory_notifier(nb) ({ (void)(nb); })
#else /* CONFIG_MEMORY_HOTPLUG */
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
@@ -166,8 +170,6 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
{ .notifier_call = fn, .priority = pri };\
register_memory_notifier(&fn##_mem_nb); \
})
-#define register_hotmemory_notifier(nb) register_memory_notifier(nb)
-#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb)
#ifdef CONFIG_NUMA
void memory_block_add_nid(struct memory_block *mem, int nid,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 974ccca609d2..f919befc8fac 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -74,6 +74,7 @@ static inline void totalram_pages_add(long count)
extern void * high_memory;
extern int page_cluster;
+extern const int page_cluster_max;
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
@@ -549,7 +550,7 @@ struct vm_operations_struct {
/*
* Called by mprotect() to make driver-specific permission
* checks before mprotect() is finalised. The VMA must not
- * be modified. Returns 0 if eprotect() can proceed.
+ * be modified. Returns 0 if mprotect() can proceed.
*/
int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
unsigned long end, unsigned long newflags);
@@ -3298,12 +3299,37 @@ extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
-extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared);
+void num_poisoned_pages_inc(unsigned long pfn);
+void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
-static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
return 0;
}
+
+static inline void num_poisoned_pages_inc(unsigned long pfn)
+{
+}
+
+static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+{
+}
+#endif
+
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+extern void memblk_nr_poison_inc(unsigned long pfn);
+extern void memblk_nr_poison_sub(unsigned long pfn, long i);
+#else
+static inline void memblk_nr_poison_inc(unsigned long pfn)
+{
+}
+
+static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
+{
+}
#endif
#ifndef arch_memory_failure
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 500e536796ca..2d5b1575ffe0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -144,6 +144,7 @@ struct page {
atomic_t compound_pincount;
#ifdef CONFIG_64BIT
unsigned int compound_nr; /* 1 << compound_order */
+ unsigned long _private_1;
#endif
};
struct { /* Second tail page of compound page */
@@ -264,6 +265,7 @@ struct page {
* @_total_mapcount: Do not use directly, call folio_entire_mapcount().
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
* @_folio_nr_pages: Do not use directly, call folio_nr_pages().
+ * @_private_1: Do not use directly, call folio_get_private_1().
*
* A folio is a physically, virtually and logically contiguous set
* of bytes. It is a power-of-two in size, and it is aligned to that
@@ -311,6 +313,7 @@ struct folio {
#ifdef CONFIG_64BIT
unsigned int _folio_nr_pages;
#endif
+ unsigned long _private_1;
};
#define FOLIO_MATCH(pg, fl) \
@@ -338,6 +341,7 @@ FOLIO_MATCH(compound_mapcount, _total_mapcount);
FOLIO_MATCH(compound_pincount, _pincount);
#ifdef CONFIG_64BIT
FOLIO_MATCH(compound_nr, _folio_nr_pages);
+FOLIO_MATCH(_private_1, _private_1);
#endif
#undef FOLIO_MATCH
@@ -383,6 +387,16 @@ static inline void *folio_get_private(struct folio *folio)
return folio->private;
}
+static inline void folio_set_private_1(struct folio *folio, unsigned long private)
+{
+ folio->_private_1 = private;
+}
+
+static inline unsigned long folio_get_private_1(struct folio *folio)
+{
+ return folio->_private_1;
+}
+
struct page_frag_cache {
void * va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bbccb4044222..b33ab86d5dca 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -504,9 +504,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
-#define FGP_HEAD 0x00000080
-#define FGP_ENTRY 0x00000100
-#define FGP_STABLE 0x00000200
+#define FGP_ENTRY 0x00000080
+#define FGP_STABLE 0x00000100
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp);
@@ -1102,7 +1101,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
pgoff_t index, gfp_t gfp);
void filemap_remove_folio(struct folio *folio);
-void delete_from_page_cache(struct page *page);
void __filemap_remove_folio(struct folio *folio, void *shadow);
void replace_page_cache_page(struct page *old, struct page *new);
void delete_from_page_cache_batch(struct address_space *mapping,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a18cf4b7c724..369d7799205d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -384,8 +384,9 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages);
-void lru_note_cost_folio(struct folio *);
+void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated);
+void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void lru_cache_add(struct page *);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 86b95ccb81bb..3ba9bf56899d 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -581,8 +581,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
#ifdef CONFIG_MEMORY_FAILURE
-extern atomic_long_t num_poisoned_pages __read_mostly;
-
/*
* Support for hardware poisoned pages
*/
@@ -597,17 +595,7 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
return swp_type(entry) == SWP_HWPOISON;
}
-static inline void num_poisoned_pages_inc(void)
-{
- atomic_long_inc(&num_poisoned_pages);
-}
-
-static inline void num_poisoned_pages_sub(long i)
-{
- atomic_long_sub(i, &num_poisoned_pages);
-}
-
-#else /* CONFIG_MEMORY_FAILURE */
+#else
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
@@ -618,15 +606,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
{
return 0;
}
-
-static inline void num_poisoned_pages_inc(void)
-{
-}
-
-static inline void num_poisoned_pages_sub(long i)
-{
-}
-#endif /* CONFIG_MEMORY_FAILURE */
+#endif
static inline int non_swap_entry(swp_entry_t entry)
{
diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h
new file mode 100644
index 000000000000..ad4e02191f35
--- /dev/null
+++ b/include/trace/events/vmalloc.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vmalloc
+
+#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VMALLOC_H
+
+#include <linux/tracepoint.h>
+
+/**
+ * alloc_vmap_area - called when a new vmap allocation occurs
+ * @addr: an allocated address
+ * @size: a requested size
+ * @align: a requested alignment
+ * @vstart: a requested start range
+ * @vend: a requested end range
+ * @failed: an allocation failed or not
+ *
+ * This event is used for a debug purpose, it can give an extra
+ * information for a developer about how often it occurs and which
+ * parameters are passed for further validation.
+ */
+TRACE_EVENT(alloc_vmap_area,
+
+ TP_PROTO(unsigned long addr, unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend, int failed),
+
+ TP_ARGS(addr, size, align, vstart, vend, failed),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, addr)
+ __field(unsigned long, size)
+ __field(unsigned long, align)
+ __field(unsigned long, vstart)
+ __field(unsigned long, vend)
+ __field(int, failed)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->size = size;
+ __entry->align = align;
+ __entry->vstart = vstart;
+ __entry->vend = vend;
+ __entry->failed = failed;
+ ),
+
+ TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d",
+ __entry->addr, __entry->size, __entry->align,
+ __entry->vstart, __entry->vend, __entry->failed)
+);
+
+/**
+ * purge_vmap_area_lazy - called when vmap areas were lazily freed
+ * @start: purging start address
+ * @end: purging end address
+ * @npurged: numbed of purged vmap areas
+ *
+ * This event is used for a debug purpose. It gives some
+ * indication about start:end range and how many objects
+ * are released.
+ */
+TRACE_EVENT(purge_vmap_area_lazy,
+
+ TP_PROTO(unsigned long start, unsigned long end,
+ unsigned int npurged),
+
+ TP_ARGS(start, end, npurged),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ __field(unsigned int, npurged)
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->npurged = npurged;
+ ),
+
+ TP_printk("start=0x%lx end=0x%lx num_purged=%u",
+ __entry->start, __entry->end, __entry->npurged)
+);
+
+/**
+ * free_vmap_area_noflush - called when a vmap area is freed
+ * @va_start: a start address of VA
+ * @nr_lazy: number of current lazy pages
+ * @nr_lazy_max: number of maximum lazy pages
+ *
+ * This event is used for a debug purpose. It gives some
+ * indication about a VA that is released, number of current
+ * outstanding areas and a maximum allowed threshold before
+ * dropping all of them.
+ */
+TRACE_EVENT(free_vmap_area_noflush,
+
+ TP_PROTO(unsigned long va_start, unsigned long nr_lazy,
+ unsigned long nr_lazy_max),
+
+ TP_ARGS(va_start, nr_lazy, nr_lazy_max),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, va_start)
+ __field(unsigned long, nr_lazy)
+ __field(unsigned long, nr_lazy_max)
+ ),
+
+ TP_fast_assign(
+ __entry->va_start = va_start;
+ __entry->nr_lazy = nr_lazy;
+ __entry->nr_lazy_max = nr_lazy_max;
+ ),
+
+ TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu",
+ __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max)
+);
+
+#endif /* _TRACE_VMALLOC_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b474289c15b8..3ea2e836e93e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3630,11 +3630,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -3652,7 +3647,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 188c305aeb8b..71a4350ac601 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2125,6 +2125,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
},
{
.procname = "dirtytime_expire_seconds",
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index ca09b1cf8ee9..ba5b27962c34 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -181,7 +181,7 @@ config KASAN_VMALLOC
config KASAN_KUNIT_TEST
tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS
- depends on KASAN && KUNIT
+ depends on KASAN && KUNIT && TRACEPOINTS
default KUNIT_ALL_TESTS
help
A KUnit-based KASAN test suite. Triggers different kinds of
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index dc7df1254f0a..2b61fde8c38c 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -38,11 +38,7 @@
* Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics
* expectations that are being validated here. All future changes in here
* or the documentation need to be in sync.
- */
-
-#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
-
-/*
+ *
* On s390 platform, the lower 4 bits are used to identify given page table
* entry type. But these bits might affect the ability to clear entries with
* pxx_clear() because of how dynamic page table folding works on s390. So
@@ -1125,7 +1121,7 @@ static int __init init_args(struct pgtable_debug_args *args)
*/
memset(args, 0, sizeof(*args));
args->vaddr = get_random_vaddr();
- args->page_prot = vm_get_page_prot(VMFLAGS);
+ args->page_prot = vm_get_page_prot(VM_ACCESS_FLAGS);
args->page_prot_none = vm_get_page_prot(VM_NONE);
args->is_contiguous_page = false;
args->pud_pfn = ULONG_MAX;
diff --git a/mm/filemap.c b/mm/filemap.c
index 08341616ae7a..65eee6ec1066 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2048,10 +2048,10 @@ reset:
*
* Return: The number of entries which were found.
*/
-unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
- XA_STATE(xas, &mapping->i_pages, start);
+ XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
rcu_read_lock();
@@ -2062,6 +2062,15 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
}
rcu_read_unlock();
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr = 1;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ folio = fbatch->folios[idx];
+ if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ nr = folio_nr_pages(folio);
+ *start = indices[idx] + nr;
+ }
return folio_batch_count(fbatch);
}
@@ -2085,16 +2094,16 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
*
* Return: The number of entries which were found.
*/
-unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
- XA_STATE(xas, &mapping->i_pages, start);
+ XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
rcu_read_lock();
while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
if (!xa_is_value(folio)) {
- if (folio->index < start)
+ if (folio->index < *start)
goto put;
if (folio->index + folio_nr_pages(folio) - 1 > end)
goto put;
@@ -2117,6 +2126,15 @@ put:
}
rcu_read_unlock();
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr = 1;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ folio = fbatch->folios[idx];
+ if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ nr = folio_nr_pages(folio);
+ *start = indices[idx] + nr;
+ }
return folio_batch_count(fbatch);
}
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index e1e23b4947d7..bac2a366aada 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -108,7 +108,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
struct folio *folio;
folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
- if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio))
+ if (!folio || xa_is_value(folio))
return &folio->page;
return folio_file_page(folio, index);
}
@@ -124,11 +124,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-void delete_from_page_cache(struct page *page)
-{
- return filemap_remove_folio(page_folio(page));
-}
-
int try_to_release_page(struct page *page, gfp_t gfp)
{
return filemap_release_folio(page_folio(page), gfp);
diff --git a/mm/gup.c b/mm/gup.c
index fe195d47de74..6b16aecf5d2c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -537,18 +537,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
-
- /*
- * Considering PTE level hugetlb, like continuous-PTE hugetlb on
- * ARM64 architecture.
- */
- if (is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd_pte(vma, address, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
-
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
@@ -680,20 +668,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd_pte(vma, address, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pmd_val(pmdval)), flags,
- PMD_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
retry:
if (!pmd_present(pmdval)) {
/*
@@ -783,20 +757,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pud(mm, address, pud, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pud_val(*pud)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pud_val(*pud)), flags,
- PUD_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
@@ -816,7 +776,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
struct follow_page_context *ctx)
{
p4d_t *p4d;
- struct page *page;
p4d = p4d_offset(pgdp, address);
if (p4d_none(*p4d))
@@ -825,14 +784,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
if (unlikely(p4d_bad(*p4d)))
return no_page_table(vma, flags);
- if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(p4d_val(*p4d)), flags,
- P4D_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
return follow_pud_mask(vma, address, p4d, flags, ctx);
}
@@ -870,10 +821,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
ctx->page_mask = 0;
- /* make this handle hugepd */
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+ /*
+ * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
+ * special hugetlb page table walking code. This eliminates the
+ * need to check for hugetlb entries in the general walking code.
+ *
+ * hugetlb_follow_page_mask is only for follow_page() handling here.
+ * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = hugetlb_follow_page_mask(vma, address, flags);
+ if (!page)
+ page = no_page_table(vma, flags);
return page;
}
@@ -882,21 +841,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags);
- if (pgd_huge(*pgd)) {
- page = follow_huge_pgd(mm, address, pgd, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pgd_val(*pgd)), flags,
- PGDIR_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
-
return follow_p4d_mask(vma, address, pgd, flags, ctx);
}
diff --git a/mm/gup_test.c b/mm/gup_test.c
index 12b0a91767d3..0d76d9b4bb5a 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -203,6 +203,135 @@ free_pages:
return ret;
}
+static DEFINE_MUTEX(pin_longterm_test_mutex);
+static struct page **pin_longterm_test_pages;
+static unsigned long pin_longterm_test_nr_pages;
+
+static inline void pin_longterm_test_stop(void)
+{
+ if (pin_longterm_test_pages) {
+ if (pin_longterm_test_nr_pages)
+ unpin_user_pages(pin_longterm_test_pages,
+ pin_longterm_test_nr_pages);
+ kfree(pin_longterm_test_pages);
+ pin_longterm_test_pages = NULL;
+ pin_longterm_test_nr_pages = 0;
+ }
+}
+
+static inline int pin_longterm_test_start(unsigned long arg)
+{
+ long nr_pages, cur_pages, addr, remaining_pages;
+ int gup_flags = FOLL_LONGTERM;
+ struct pin_longterm_test args;
+ struct page **pages;
+ int ret = 0;
+ bool fast;
+
+ if (pin_longterm_test_pages)
+ return -EINVAL;
+
+ if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
+ return -EFAULT;
+
+ if (args.flags &
+ ~(PIN_LONGTERM_TEST_FLAG_USE_WRITE|PIN_LONGTERM_TEST_FLAG_USE_FAST))
+ return -EINVAL;
+ if (!IS_ALIGNED(args.addr | args.size, PAGE_SIZE))
+ return -EINVAL;
+ if (args.size > LONG_MAX)
+ return -EINVAL;
+ nr_pages = args.size / PAGE_SIZE;
+ if (!nr_pages)
+ return -EINVAL;
+
+ pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (args.flags & PIN_LONGTERM_TEST_FLAG_USE_WRITE)
+ gup_flags |= FOLL_WRITE;
+ fast = !!(args.flags & PIN_LONGTERM_TEST_FLAG_USE_FAST);
+
+ if (!fast && mmap_read_lock_killable(current->mm)) {
+ kfree(pages);
+ return -EINTR;
+ }
+
+ pin_longterm_test_pages = pages;
+ pin_longterm_test_nr_pages = 0;
+
+ while (nr_pages - pin_longterm_test_nr_pages) {
+ remaining_pages = nr_pages - pin_longterm_test_nr_pages;
+ addr = args.addr + pin_longterm_test_nr_pages * PAGE_SIZE;
+
+ if (fast)
+ cur_pages = pin_user_pages_fast(addr, remaining_pages,
+ gup_flags, pages);
+ else
+ cur_pages = pin_user_pages(addr, remaining_pages,
+ gup_flags, pages, NULL);
+ if (cur_pages < 0) {
+ pin_longterm_test_stop();
+ ret = cur_pages;
+ break;
+ }
+ pin_longterm_test_nr_pages += cur_pages;
+ pages += cur_pages;
+ }
+
+ if (!fast)
+ mmap_read_unlock(current->mm);
+ return ret;
+}
+
+static inline int pin_longterm_test_read(unsigned long arg)
+{
+ __u64 user_addr;
+ unsigned long i;
+
+ if (!pin_longterm_test_pages)
+ return -EINVAL;
+
+ if (copy_from_user(&user_addr, (void __user *)arg, sizeof(user_addr)))
+ return -EFAULT;
+
+ for (i = 0; i < pin_longterm_test_nr_pages; i++) {
+ void *addr = page_to_virt(pin_longterm_test_pages[i]);
+
+ if (copy_to_user((void __user *)(unsigned long)user_addr, addr,
+ PAGE_SIZE))
+ return -EFAULT;
+ user_addr += PAGE_SIZE;
+ }
+ return 0;
+}
+
+static long pin_longterm_test_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = -EINVAL;
+
+ if (mutex_lock_killable(&pin_longterm_test_mutex))
+ return -EINTR;
+
+ switch (cmd) {
+ case PIN_LONGTERM_TEST_START:
+ ret = pin_longterm_test_start(arg);
+ break;
+ case PIN_LONGTERM_TEST_STOP:
+ pin_longterm_test_stop();
+ ret = 0;
+ break;
+ case PIN_LONGTERM_TEST_READ:
+ ret = pin_longterm_test_read(arg);
+ break;
+ }
+
+ mutex_unlock(&pin_longterm_test_mutex);
+ return ret;
+}
+
static long gup_test_ioctl(struct file *filep, unsigned int cmd,
unsigned long arg)
{
@@ -217,6 +346,10 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd,
case PIN_BASIC_TEST:
case DUMP_USER_PAGES_TEST:
break;
+ case PIN_LONGTERM_TEST_START:
+ case PIN_LONGTERM_TEST_STOP:
+ case PIN_LONGTERM_TEST_READ:
+ return pin_longterm_test_ioctl(filep, cmd, arg);
default:
return -EINVAL;
}
@@ -234,9 +367,17 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd,
return 0;
}
+static int gup_test_release(struct inode *inode, struct file *file)
+{
+ pin_longterm_test_stop();
+
+ return 0;
+}
+
static const struct file_operations gup_test_fops = {
.open = nonseekable_open,
.unlocked_ioctl = gup_test_ioctl,
+ .release = gup_test_release,
};
static int __init gup_test_init(void)
diff --git a/mm/gup_test.h b/mm/gup_test.h
index 887ac1d5f5bc..5b37b54e8bea 100644
--- a/mm/gup_test.h
+++ b/mm/gup_test.h
@@ -10,6 +10,9 @@
#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test)
#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test)
#define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test)
+#define PIN_LONGTERM_TEST_START _IOW('g', 7, struct pin_longterm_test)
+#define PIN_LONGTERM_TEST_STOP _IO('g', 8)
+#define PIN_LONGTERM_TEST_READ _IOW('g', 9, __u64)
#define GUP_TEST_MAX_PAGES_TO_DUMP 8
@@ -30,4 +33,13 @@ struct gup_test {
__u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP];
};
+#define PIN_LONGTERM_TEST_FLAG_USE_WRITE 1
+#define PIN_LONGTERM_TEST_FLAG_USE_FAST 2
+
+struct pin_longterm_test {
+ __u64 addr;
+ __u64 size;
+ __u32 flags;
+};
+
#endif /* __GUP_TEST_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 811d19b5c4f6..b26998d1845f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2715,7 +2715,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* split PMDs
*/
if (!can_split_folio(folio, &extra_pins)) {
- ret = -EBUSY;
+ ret = -EAGAIN;
goto out_unlock;
}
@@ -2765,7 +2765,7 @@ fail:
xas_unlock(&xas);
local_irq_enable();
remap_page(folio, folio_nr_pages(folio));
- ret = -EBUSY;
+ ret = -EAGAIN;
}
out_unlock:
@@ -3069,28 +3069,28 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
mapping = candidate->f_mapping;
for (index = off_start; index < off_end; index += nr_pages) {
- struct page *fpage = pagecache_get_page(mapping, index,
- FGP_ENTRY | FGP_HEAD, 0);
+ struct folio *folio = __filemap_get_folio(mapping, index,
+ FGP_ENTRY, 0);
nr_pages = 1;
- if (xa_is_value(fpage) || !fpage)
+ if (xa_is_value(folio) || !folio)
continue;
- if (!is_transparent_hugepage(fpage))
+ if (!folio_test_large(folio))
goto next;
total++;
- nr_pages = thp_nr_pages(fpage);
+ nr_pages = folio_nr_pages(folio);
- if (!trylock_page(fpage))
+ if (!folio_trylock(folio))
goto next;
- if (!split_huge_page(fpage))
+ if (!split_folio(folio))
split++;
- unlock_page(fpage);
+ folio_unlock(folio);
next:
- put_page(fpage);
+ folio_put(folio);
cond_resched();
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e36ca75311a5..be09678d0582 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4777,7 +4777,6 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
hugepage_add_new_anon_rmap(new_page, vma, addr);
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
- ClearHPageRestoreReserve(new_page);
SetHPageMigratable(new_page);
}
@@ -5117,7 +5116,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
/*
* If the pte was wr-protected by uffd-wp in any of the
* swap forms, meanwhile the caller does not want to
@@ -5129,7 +5127,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
else
-#endif
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
@@ -5158,13 +5155,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
-#endif
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@@ -5445,8 +5440,6 @@ retry_avoidcopy:
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- ClearHPageRestoreReserve(new_page);
-
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
@@ -5741,10 +5734,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (!pte_same(huge_ptep_get(ptep), old_pte))
goto backout;
- if (anon_rmap) {
- ClearHPageRestoreReserve(page);
+ if (anon_rmap)
hugepage_add_new_anon_rmap(page, vma, haddr);
- } else
+ else
page_dup_file_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
@@ -6131,12 +6123,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (page_in_pagecache) {
+ if (page_in_pagecache)
page_dup_file_rmap(page, true);
- } else {
- ClearHPageRestoreReserve(page);
+ else
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
- }
/*
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
@@ -6220,6 +6210,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
return false;
}
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & huge_page_mask(h);
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte, entry;
+
+ /*
+ * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+ * follow_hugetlb_page().
+ */
+ if (WARN_ON_ONCE(flags & FOLL_PIN))
+ return NULL;
+
+retry:
+ pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ if (!pte)
+ return NULL;
+
+ ptl = huge_pte_lock(h, mm, pte);
+ entry = huge_ptep_get(pte);
+ if (pte_present(entry)) {
+ page = pte_page(entry) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ /*
+ * Note that page may be a sub-page, and with vmemmap
+ * optimizations the page struct may be read only.
+ * try_grab_page() will increase the ref count on the
+ * head page, so this will be OK.
+ *
+ * try_grab_page() should always succeed here, because we hold
+ * the ptl lock and have verified pte_present().
+ */
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ page = NULL;
+ goto out;
+ }
+ } else {
+ if (is_hugetlb_entry_migration(entry)) {
+ spin_unlock(ptl);
+ __migration_entry_wait_huge(pte, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+out:
+ spin_unlock(ptl);
+ return page;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -7212,122 +7258,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
* These functions are overwritable if your architecture needs its own
* behavior.
*/
-struct page * __weak
-follow_huge_addr(struct mm_struct *mm, unsigned long address,
- int write)
-{
- return ERR_PTR(-EINVAL);
-}
-
-struct page * __weak
-follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd, int flags, int pdshift)
-{
- WARN(1, "hugepd follow called with no support for hugepage directory format\n");
- return NULL;
-}
-
-struct page * __weak
-follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
-{
- struct hstate *h = hstate_vma(vma);
- struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t *ptep, pte;
-
- /*
- * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
- * follow_hugetlb_page().
- */
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
-
-retry:
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
- if (!ptep)
- return NULL;
-
- ptl = huge_pte_lock(h, mm, ptep);
- pte = huge_ptep_get(ptep);
- if (pte_present(pte)) {
- page = pte_page(pte) +
- ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
- /*
- * try_grab_page() should always succeed here, because: a) we
- * hold the pmd (ptl) lock, and b) we've just checked that the
- * huge pmd (head) page is present in the page tables. The ptl
- * prevents the head page and tail pages from being rearranged
- * in any way. So this page must be available at this point,
- * unless the page refcount overflowed:
- */
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge(ptep, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
-out:
- spin_unlock(ptl);
- return page;
-}
-
-struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
-{
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t pte;
-
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
-
-retry:
- ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
- if (!pud_huge(*pud))
- goto out;
- pte = huge_ptep_get((pte_t *)pud);
- if (pte_present(pte)) {
- page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait(mm, (pte_t *)pud, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
-out:
- spin_unlock(ptl);
- return page;
-}
-
-struct page * __weak
-follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
-{
- if (flags & (FOLL_GET | FOLL_PIN))
- return NULL;
-
- return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
-}
-
int isolate_hugetlb(struct page *page, struct list_head *list)
{
int ret = 0;
@@ -7346,7 +7276,7 @@ unlock:
return ret;
}
-int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
{
int ret = 0;
@@ -7356,7 +7286,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
*hugetlb = true;
if (HPageFreed(page))
ret = 0;
- else if (HPageMigratable(page))
+ else if (HPageMigratable(page) || unpoison)
ret = get_page_unless_zero(page);
else
ret = -EBUSY;
@@ -7365,12 +7295,13 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
return ret;
}
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
int ret;
spin_lock_irq(&hugetlb_lock);
- ret = __get_huge_page_for_hwpoison(pfn, flags);
+ ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
spin_unlock_irq(&hugetlb_lock);
return ret;
}
diff --git a/mm/internal.h b/mm/internal.h
index 6b7ef495b56d..bcf75a8b032d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -106,9 +106,9 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
force_page_cache_ra(&ractl, nr_to_read);
}
-unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
-unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
@@ -708,14 +708,6 @@ extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
-#ifdef CONFIG_MEMORY_FAILURE
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages);
-#else
-static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-}
-#endif
-
extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index abbcc1b0eec5..a84491bc4867 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -261,14 +261,6 @@ struct kasan_stack_ring {
#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-/* Used in KUnit-compatible KASAN tests. */
-struct kunit_kasan_status {
- bool report_found;
- bool sync_fault;
-};
-#endif
-
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 0d59098f0876..e27591ef2777 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -5,8 +5,12 @@
* Author: Andrey Ryabinin <a.ryabinin@samsung.com>
*/
+#define pr_fmt(fmt) "kasan_test: " fmt
+
+#include <kunit/test.h>
#include <linux/bitops.h>
#include <linux/delay.h>
+#include <linux/io.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -14,21 +18,28 @@
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/random.h>
+#include <linux/set_memory.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/tracepoint.h>
#include <linux/uaccess.h>
-#include <linux/io.h>
#include <linux/vmalloc.h>
-#include <linux/set_memory.h>
+#include <trace/events/printk.h>
#include <asm/page.h>
-#include <kunit/test.h>
-
#include "kasan.h"
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
+static bool multishot;
+
+/* Fields set based on lines observed in the console. */
+static struct {
+ bool report_found;
+ bool async_fault;
+} test_status;
+
/*
* Some tests use these global variables to store return values from function
* calls that could otherwise be eliminated by the compiler as dead code.
@@ -36,35 +47,61 @@
void *kasan_ptr_result;
int kasan_int_result;
-static struct kunit_resource resource;
-static struct kunit_kasan_status test_status;
-static bool multishot;
+/* Probe for console output: obtains test_status lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ if (strnstr(buf, "BUG: KASAN: ", len))
+ WRITE_ONCE(test_status.report_found, true);
+ else if (strnstr(buf, "Asynchronous fault: ", len))
+ WRITE_ONCE(test_status.async_fault, true);
+}
-/*
- * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the
- * first detected bug and panic the kernel if panic_on_warn is enabled. For
- * hardware tag-based KASAN also allow tag checking to be reenabled for each
- * test, see the comment for KUNIT_EXPECT_KASAN_FAIL().
- */
-static int kasan_test_init(struct kunit *test)
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ check_trace_callback_type_console(probe_console);
+ if (!strcmp(tp->name, "console"))
+ WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ if (!strcmp(tp->name, "console"))
+ tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+static int kasan_suite_init(struct kunit_suite *suite)
{
if (!kasan_enabled()) {
- kunit_err(test, "can't run KASAN tests with KASAN disabled");
+ pr_err("Can't run KASAN tests with KASAN disabled");
return -1;
}
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
+ */
multishot = kasan_save_enable_multi_shot();
- test_status.report_found = false;
- test_status.sync_fault = false;
- kunit_add_named_resource(test, NULL, NULL, &resource,
- "kasan_status", &test_status);
+
+ /*
+ * Because we want to be able to build the test as a module, we need to
+ * iterate through all known tracepoints, since the static registration
+ * won't work here.
+ */
+ for_each_kernel_tracepoint(register_tracepoints, NULL);
return 0;
}
-static void kasan_test_exit(struct kunit *test)
+static void kasan_suite_exit(struct kunit_suite *suite)
{
kasan_restore_multi_shot(multishot);
- KUNIT_EXPECT_FALSE(test, test_status.report_found);
+ for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+static void kasan_test_exit(struct kunit *test)
+{
+ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found));
}
/**
@@ -106,11 +143,12 @@ static void kasan_test_exit(struct kunit *test)
if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
kasan_sync_fault_possible()) { \
if (READ_ONCE(test_status.report_found) && \
- READ_ONCE(test_status.sync_fault)) \
+ !READ_ONCE(test_status.async_fault)) \
kasan_enable_tagging(); \
migrate_enable(); \
} \
WRITE_ONCE(test_status.report_found, false); \
+ WRITE_ONCE(test_status.async_fault, false); \
} while (0)
#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
@@ -1103,6 +1141,67 @@ static void kmalloc_double_kzfree(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
}
+/*
+ * The two tests below check that Generic KASAN prints auxiliary stack traces
+ * for RCU callbacks and workqueues. The reports need to be inspected manually.
+ *
+ * These tests are still enabled for other KASAN modes to make sure that all
+ * modes report bad accesses in tested scenarios.
+ */
+
+static struct kasan_rcu_info {
+ int i;
+ struct rcu_head rcu;
+} *global_rcu_ptr;
+
+static void rcu_uaf_reclaim(struct rcu_head *rp)
+{
+ struct kasan_rcu_info *fp =
+ container_of(rp, struct kasan_rcu_info, rcu);
+
+ kfree(fp);
+ ((volatile struct kasan_rcu_info *)fp)->i;
+}
+
+static void rcu_uaf(struct kunit *test)
+{
+ struct kasan_rcu_info *ptr;
+
+ ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ global_rcu_ptr = rcu_dereference_protected(
+ (struct kasan_rcu_info __rcu *)ptr, NULL);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
+ rcu_barrier());
+}
+
+static void workqueue_uaf_work(struct work_struct *work)
+{
+ kfree(work);
+}
+
+static void workqueue_uaf(struct kunit *test)
+{
+ struct workqueue_struct *workqueue;
+ struct work_struct *work;
+
+ workqueue = create_workqueue("kasan_workqueue_test");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, workqueue);
+
+ work = kmalloc(sizeof(struct work_struct), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, work);
+
+ INIT_WORK(work, workqueue_uaf_work);
+ queue_work(workqueue, work);
+ destroy_workqueue(workqueue);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ((volatile struct work_struct *)work)->data);
+}
+
static void vmalloc_helpers_tags(struct kunit *test)
{
void *ptr;
@@ -1434,6 +1533,8 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
KUNIT_CASE(kmalloc_double_kzfree),
+ KUNIT_CASE(rcu_uaf),
+ KUNIT_CASE(workqueue_uaf),
KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
@@ -1447,9 +1548,10 @@ static struct kunit_case kasan_kunit_test_cases[] = {
static struct kunit_suite kasan_kunit_test_suite = {
.name = "kasan",
- .init = kasan_test_init,
.test_cases = kasan_kunit_test_cases,
.exit = kasan_test_exit,
+ .suite_init = kasan_suite_init,
+ .suite_exit = kasan_suite_exit,
};
kunit_test_suite(kasan_kunit_test_suite);
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
index e4ca82dc2c16..7be7bed456ef 100644
--- a/mm/kasan/kasan_test_module.c
+++ b/mm/kasan/kasan_test_module.c
@@ -62,64 +62,6 @@ static noinline void __init copy_user_test(void)
kfree(kmem);
}
-static struct kasan_rcu_info {
- int i;
- struct rcu_head rcu;
-} *global_rcu_ptr;
-
-static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
-{
- struct kasan_rcu_info *fp = container_of(rp,
- struct kasan_rcu_info, rcu);
-
- kfree(fp);
- ((volatile struct kasan_rcu_info *)fp)->i;
-}
-
-static noinline void __init kasan_rcu_uaf(void)
-{
- struct kasan_rcu_info *ptr;
-
- pr_info("use-after-free in kasan_rcu_reclaim\n");
- ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- global_rcu_ptr = rcu_dereference_protected(ptr, NULL);
- call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim);
-}
-
-static noinline void __init kasan_workqueue_work(struct work_struct *work)
-{
- kfree(work);
-}
-
-static noinline void __init kasan_workqueue_uaf(void)
-{
- struct workqueue_struct *workqueue;
- struct work_struct *work;
-
- workqueue = create_workqueue("kasan_wq_test");
- if (!workqueue) {
- pr_err("Allocation failed\n");
- return;
- }
- work = kmalloc(sizeof(struct work_struct), GFP_KERNEL);
- if (!work) {
- pr_err("Allocation failed\n");
- return;
- }
-
- INIT_WORK(work, kasan_workqueue_work);
- queue_work(workqueue, work);
- destroy_workqueue(workqueue);
-
- pr_info("use-after-free on workqueue\n");
- ((volatile struct work_struct *)work)->data;
-}
-
static int __init test_kasan_module_init(void)
{
/*
@@ -130,8 +72,6 @@ static int __init test_kasan_module_init(void)
bool multishot = kasan_save_enable_multi_shot();
copy_user_test();
- kasan_rcu_uaf();
- kasan_workqueue_uaf();
kasan_restore_multi_shot(multishot);
return -EAGAIN;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index df3602062bfd..31355851a5ec 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -30,8 +30,6 @@
#include <asm/sections.h>
-#include <kunit/test.h>
-
#include "kasan.h"
#include "../slab.h"
@@ -114,41 +112,12 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
#endif
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-static void update_kunit_status(bool sync)
-{
- struct kunit *test;
- struct kunit_resource *resource;
- struct kunit_kasan_status *status;
-
- test = current->kunit_test;
- if (!test)
- return;
-
- resource = kunit_find_named_resource(test, "kasan_status");
- if (!resource) {
- kunit_set_failure(test);
- return;
- }
-
- status = (struct kunit_kasan_status *)resource->data;
- WRITE_ONCE(status->report_found, true);
- WRITE_ONCE(status->sync_fault, sync);
-
- kunit_put_resource(resource);
-}
-#else
-static void update_kunit_status(bool sync) { }
-#endif
-
static DEFINE_SPINLOCK(report_lock);
static void start_report(unsigned long *flags, bool sync)
{
/* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
disable_trace_on_warning();
- /* Update status of the currently running KASAN test. */
- update_kunit_status(sync);
/* Do not allow LOCKDEP mangling KASAN reports. */
lockdep_off();
/* Make sure we don't end up in loop. */
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 0e3648b603a6..2fba1f51f042 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -244,7 +244,7 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
static int __init kasan_memhotplug_init(void)
{
- hotplug_memory_notifier(kasan_mem_notifier, 0);
+ hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI);
return 0;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index c19fcca9bc03..7ba97f86d831 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3211,7 +3211,7 @@ static int __init ksm_init(void)
#ifdef CONFIG_MEMORY_HOTREMOVE
/* There is no significance to this priority 100 */
- hotplug_memory_notifier(ksm_memory_callback, 100);
+ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
#endif
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a1a35c12635e..c95e2ed6e7fd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1219,7 +1219,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
* cgroup root (root_mem_cgroup). So we have to handle
* dead_memcg from cgroup root separately.
*/
- if (last != root_mem_cgroup)
+ if (!mem_cgroup_is_root(last))
__invalidate_reclaim_iterators(root_mem_cgroup,
dead_memcg);
}
@@ -1243,7 +1243,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct mem_cgroup *iter;
int ret = 0;
- BUG_ON(memcg == root_mem_cgroup);
+ BUG_ON(mem_cgroup_is_root(memcg));
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1272,7 +1272,7 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
memcg = folio_memcg(folio);
if (!memcg)
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
+ VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
else
VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
@@ -2036,7 +2036,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
rcu_read_lock();
memcg = mem_cgroup_from_task(victim);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
/*
@@ -2995,7 +2995,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg = NULL;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
objcg = rcu_dereference(memcg->objcg);
if (objcg && obj_cgroup_tryget(objcg))
break;
@@ -5648,15 +5648,21 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
+ unsigned long index;
+ struct folio *folio;
+
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- /* page is moved even if it's not RSS of this task(page-faulted). */
+ /* folio is moved even if it's not RSS of this task(page-faulted). */
/* shmem/tmpfs may report page out on swap: account for that too. */
- return find_get_incore_page(vma->vm_file->f_mapping,
- linear_page_index(vma, addr));
+ index = linear_page_index(vma, addr);
+ folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
+ if (!folio)
+ return NULL;
+ return folio_file_page(folio, index);
}
/**
@@ -7163,7 +7169,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
@@ -7298,7 +7304,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
*/
- if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
VM_BUG_ON(1);
break;
}
@@ -7462,7 +7468,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
if (mem_cgroup_disabled() || do_memsw_account())
return nr_swap_pages;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
@@ -7484,7 +7490,7 @@ bool mem_cgroup_swap_full(struct folio *folio)
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
@@ -7648,7 +7654,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
return true;
original_memcg = get_mem_cgroup_from_objcg(objcg);
- for (memcg = original_memcg; memcg != root_mem_cgroup;
+ for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
memcg = parent_mem_cgroup(memcg)) {
unsigned long max = READ_ONCE(memcg->zswap_max);
unsigned long pages;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bead6bccc7f2..779a426d2cab 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -74,6 +74,19 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
+inline void num_poisoned_pages_inc(unsigned long pfn)
+{
+ atomic_long_inc(&num_poisoned_pages);
+ memblk_nr_poison_inc(pfn);
+}
+
+inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+{
+ atomic_long_sub(i, &num_poisoned_pages);
+ if (pfn != -1UL)
+ memblk_nr_poison_sub(pfn, i);
+}
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -115,7 +128,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
if (release)
put_page(page);
page_ref_inc(page);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(page_to_pfn(page));
return true;
}
@@ -1182,14 +1195,16 @@ static struct page_state error_states[] = {
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, enum mf_action_page_type type,
- enum mf_result result)
+static int action_result(unsigned long pfn, enum mf_action_page_type type,
+ enum mf_result result)
{
trace_memory_failure_event(pfn, type, result);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(pfn);
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
+
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
static int page_action(struct page_state *ps, struct page *p,
@@ -1200,14 +1215,12 @@ static int page_action(struct page_state *ps, struct page *p,
/* page p should be unlocked after returning from ps->action(). */
result = ps->action(ps, p);
- action_result(pfn, ps->type, result);
-
/* Could do more checks here if page looks ok */
/*
* Could adjust zone counters here to correct for the missing page.
*/
- return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+ return action_result(pfn, ps->type, result);
}
static inline bool PageHWPoisonTakenOff(struct page *page)
@@ -1247,7 +1260,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
int ret = 0;
bool hugetlb = false;
- ret = get_hwpoison_huge_page(head, &hugetlb);
+ ret = get_hwpoison_huge_page(head, &hugetlb, false);
if (hugetlb)
return ret;
@@ -1337,7 +1350,7 @@ static int __get_unpoison_page(struct page *page)
int ret = 0;
bool hugetlb = false;
- ret = get_hwpoison_huge_page(head, &hugetlb);
+ ret = get_hwpoison_huge_page(head, &hugetlb, true);
if (hugetlb)
return ret;
@@ -1699,6 +1712,8 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
if (move_flag)
SetPageHWPoison(p->page);
+ else
+ num_poisoned_pages_sub(page_to_pfn(p->page), 1);
kfree(p);
count++;
}
@@ -1734,7 +1749,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
llist_add(&raw_hwp->node, head);
/* the first error event will be counted in action_result(). */
if (ret)
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
* Failed to save raw error info. We no longer trace all
@@ -1788,7 +1803,8 @@ void hugetlb_clear_page_hwpoison(struct page *hpage)
* -EBUSY - the hugepage is busy (try to retry)
* -EHWPOISON - the hugepage is already hwpoisoned
*/
-int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
struct page *page = pfn_to_page(pfn);
struct page *head = compound_head(page);
@@ -1818,6 +1834,15 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
goto out;
}
+ /*
+ * Clearing HPageMigratable for hwpoisoned hugepages to prevent them
+ * from being migrated by memory hotremove.
+ */
+ if (count_increased && HPageMigratable(head)) {
+ ClearHPageMigratable(head);
+ *migratable_cleared = true;
+ }
+
return ret;
out:
if (count_increased)
@@ -1837,10 +1862,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
struct page *p = pfn_to_page(pfn);
struct page *head;
unsigned long page_flags;
+ bool migratable_cleared = false;
*hugetlb = 1;
retry:
- res = get_huge_page_for_hwpoison(pfn, flags);
+ res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
if (res == 2) { /* fallback to normal page handling */
*hugetlb = 0;
return 0;
@@ -1856,8 +1882,7 @@ retry:
flags |= MF_NO_RETRY;
goto retry;
}
- action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
- return res;
+ return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
}
head = compound_head(p);
@@ -1865,6 +1890,8 @@ retry:
if (hwpoison_filter(p)) {
hugetlb_clear_page_hwpoison(head);
+ if (migratable_cleared)
+ SetHPageMigratable(head);
unlock_page(head);
if (res == 1)
put_page(head);
@@ -1883,22 +1910,17 @@ retry:
} else {
res = MF_FAILED;
}
- action_result(pfn, MF_MSG_FREE_HUGE, res);
- return res == MF_RECOVERED ? 0 : -EBUSY;
+ return action_result(pfn, MF_MSG_FREE_HUGE, res);
}
page_flags = head->flags;
if (!hwpoison_user_mappings(p, pfn, flags, head)) {
- action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
- res = -EBUSY;
- goto out;
+ unlock_page(head);
+ return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
}
return identify_page_state(pfn, p, page_flags);
-out:
- unlock_page(head);
- return res;
}
#else
@@ -1913,17 +1935,25 @@ static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
}
#endif /* CONFIG_HUGETLB_PAGE */
+/* Drop the extra refcount in case we come from madvise() */
+static void put_ref_page(unsigned long pfn, int flags)
+{
+ struct page *page;
+
+ if (!(flags & MF_COUNT_INCREASED))
+ return;
+
+ page = pfn_to_page(pfn);
+ if (page)
+ put_page(page);
+}
+
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
{
- struct page *page = pfn_to_page(pfn);
int rc = -ENXIO;
- if (flags & MF_COUNT_INCREASED)
- /*
- * Drop the extra refcount in case we come from madvise().
- */
- put_page(page);
+ put_ref_page(pfn, flags);
/* device metadata space is not recoverable */
if (!pgmap_pfn_valid(pgmap, pfn))
@@ -2055,16 +2085,13 @@ try_again:
}
res = MF_FAILED;
}
- action_result(pfn, MF_MSG_BUDDY, res);
- res = res == MF_RECOVERED ? 0 : -EBUSY;
+ res = action_result(pfn, MF_MSG_BUDDY, res);
} else {
- action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
}
goto unlock_mutex;
} else if (res < 0) {
- action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
goto unlock_mutex;
}
}
@@ -2085,8 +2112,7 @@ try_again:
*/
SetPageHasHWPoisoned(hpage);
if (try_to_split_thp_page(p) < 0) {
- action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
goto unlock_mutex;
}
VM_BUG_ON_PAGE(!page_count(p), p);
@@ -2119,8 +2145,7 @@ try_again:
retry = false;
goto try_again;
}
- action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
goto unlock_page;
}
@@ -2160,8 +2185,7 @@ try_again:
* Abort on fail: __filemap_remove_folio() assumes unmapped page.
*/
if (!hwpoison_user_mappings(p, pfn, flags, p)) {
- action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
goto unlock_page;
}
@@ -2169,8 +2193,7 @@ try_again:
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
goto unlock_page;
}
@@ -2317,6 +2340,7 @@ int unpoison_memory(unsigned long pfn)
int ret = -EBUSY;
int freeit = 0;
unsigned long count = 1;
+ bool huge = false;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -2365,6 +2389,7 @@ int unpoison_memory(unsigned long pfn)
ret = get_hwpoison_page(p, MF_UNPOISON);
if (!ret) {
if (PageHuge(p)) {
+ huge = true;
count = free_raw_hwp_pages(page, false);
if (count == 0) {
ret = -EBUSY;
@@ -2380,6 +2405,7 @@ int unpoison_memory(unsigned long pfn)
pfn, &unpoison_rs);
} else {
if (PageHuge(p)) {
+ huge = true;
count = free_raw_hwp_pages(page, false);
if (count == 0) {
ret = -EBUSY;
@@ -2399,7 +2425,8 @@ int unpoison_memory(unsigned long pfn)
unlock_mutex:
mutex_unlock(&mf_mutex);
if (!ret || freeit) {
- num_poisoned_pages_sub(count);
+ if (!huge)
+ num_poisoned_pages_sub(pfn, 1);
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
page_to_pfn(p), &unpoison_rs);
}
@@ -2516,12 +2543,6 @@ static int soft_offline_in_use_page(struct page *page)
return ret;
}
-static void put_ref_page(struct page *page)
-{
- if (page)
- put_page(page);
-}
-
/**
* soft_offline_page - Soft offline a page.
* @pfn: pfn to soft-offline
@@ -2550,19 +2571,17 @@ int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
bool try_again = true;
- struct page *page, *ref_page = NULL;
-
- WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
+ struct page *page;
- if (!pfn_valid(pfn))
+ if (!pfn_valid(pfn)) {
+ WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
return -ENXIO;
- if (flags & MF_COUNT_INCREASED)
- ref_page = pfn_to_page(pfn);
+ }
/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
page = pfn_to_online_page(pfn);
if (!page) {
- put_ref_page(ref_page);
+ put_ref_page(pfn, flags);
return -EIO;
}
@@ -2570,7 +2589,7 @@ int soft_offline_page(unsigned long pfn, int flags)
if (PageHWPoison(page)) {
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
- put_ref_page(ref_page);
+ put_ref_page(pfn, flags);
mutex_unlock(&mf_mutex);
return 0;
}
@@ -2602,26 +2621,3 @@ retry:
return ret;
}
-
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
- int i, total = 0;
-
- /*
- * A further optimization is to have per section refcounted
- * num_poisoned_pages. But that would need more space per memmap, so
- * for now just do a quick global check to speed up this routine in the
- * absence of bad pages.
- */
- if (atomic_long_read(&num_poisoned_pages) == 0)
- return;
-
- for (i = 0; i < nr_pages; i++) {
- if (PageHWPoison(&memmap[i])) {
- total++;
- ClearPageHWPoison(&memmap[i]);
- }
- }
- if (total)
- num_poisoned_pages_sub(total);
-}
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index fa8c9d07f9ce..939e200c283b 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -664,7 +664,7 @@ static int __init memory_tier_init(void)
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
- hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
+ hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
}
subsys_initcall(memory_tier_init);
diff --git a/mm/memory.c b/mm/memory.c
index 8c8420934d60..7826143ec9cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1384,12 +1384,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte,
struct zap_details *details, pte_t pteval)
{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (zap_drop_file_uffd_wp(details))
return;
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
-#endif
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
diff --git a/mm/migrate.c b/mm/migrate.c
index dff333593a8a..f8c85b42e2bc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1506,9 +1506,22 @@ thp_subpage_migration:
if (is_thp) {
nr_thp_failed++;
/* THP NUMA faulting doesn't split THP to retry. */
- if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
- nr_thp_split++;
- break;
+ if (!nosplit) {
+ int ret = try_split_thp(page, &thp_split_pages);
+
+ if (!ret) {
+ nr_thp_split++;
+ break;
+ } else if (reason == MR_LONGTERM_PIN &&
+ ret == -EAGAIN) {
+ /*
+ * Try again to split THP to mitigate
+ * the failure of longterm pinning.
+ */
+ thp_retry++;
+ nr_retry_pages += nr_subpages;
+ break;
+ }
}
} else if (!no_subpage_counting) {
nr_failed++;
@@ -1620,7 +1633,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
nid = folio_nid(folio);
if (folio_test_hugetlb(folio)) {
- struct hstate *h = page_hstate(&folio->page);
+ struct hstate *h = folio_hstate(folio);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
diff --git a/mm/mincore.c b/mm/mincore.c
index fa200c14185f..a085a2aeabd8 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -52,7 +52,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
{
unsigned char present = 0;
- struct page *page;
+ struct folio *folio;
/*
* When tmpfs swaps out a page from a file, any process mapping that
@@ -60,10 +60,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- page = find_get_incore_page(mapping, index);
- if (page) {
- present = PageUptodate(page);
- put_page(page);
+ folio = filemap_get_incore_folio(mapping, index);
+ if (folio) {
+ present = folio_test_uptodate(folio);
+ folio_put(folio);
}
return present;
@@ -190,8 +190,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
unsigned long end;
int err;
- vma = find_vma(current->mm, addr);
- if (!vma || addr < vma->vm_start)
+ vma = vma_lookup(current->mm, addr);
+ if (!vma)
return -ENOMEM;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
if (!can_do_mincore(vma)) {
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 0d7b2bd2454a..c1883362e71d 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -178,16 +178,10 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block compute_batch_nb __meminitdata = {
- .notifier_call = mm_compute_batch_notifier,
- .priority = IPC_CALLBACK_PRI, /* use lowest priority */
-};
-
static int __init mm_compute_batch_init(void)
{
mm_compute_batch(sysctl_overcommit_memory);
- register_hotmemory_notifier(&compute_batch_nb);
-
+ hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
return 0;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 74a84eb33b90..4624ff3ded29 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3753,13 +3753,9 @@ static int reserve_mem_notifier(struct notifier_block *nb,
return NOTIFY_OK;
}
-static struct notifier_block reserve_mem_nb = {
- .notifier_call = reserve_mem_notifier,
-};
-
static int __meminit init_reserve_notifier(void)
{
- if (register_hotmemory_notifier(&reserve_mem_nb))
+ if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI))
pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 668bfaa6ed2a..8d770855b591 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -267,7 +267,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
/*
* For file-backed mem, we need to be able to
@@ -279,7 +278,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
make_pte_marker(PTE_MARKER_UFFD_WP));
pages++;
}
-#endif
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -756,8 +754,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
* If a permission is not passed to mprotect(), it must be
* cleared from the VMA.
*/
- mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
- VM_FLAGS_CLEAR;
+ mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
newflags = calc_vm_prot_bits(prot, new_vma_pkey);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index ddf1968560f0..4ee522fd381c 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -513,7 +513,7 @@ void __init page_ext_init(void)
cond_resched();
}
}
- hotplug_memory_notifier(page_ext_callback, 0);
+ hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
pr_info("allocated %ld bytes of page_ext\n", total_usage);
invoke_init_callbacks();
return;
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ec925e5fa6a..3b2d18bbdc44 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -315,8 +315,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
enomem_failure:
/*
- * dst->anon_vma is dropped here otherwise its degree can be incorrectly
- * decremented in unlink_anon_vmas().
+ * dst->anon_vma is dropped here otherwise its num_active_vmas can
+ * be incorrectly decremented in unlink_anon_vmas().
* We can safely do this because callers of anon_vma_clone() don't care
* about dst->anon_vma if anon_vma_clone() failed.
*/
@@ -1801,7 +1801,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return vma_is_temporary_stack(vma);
}
-static int page_not_mapped(struct folio *folio)
+static int folio_not_mapped(struct folio *folio)
{
return !folio_mapped(folio);
}
@@ -1822,7 +1822,7 @@ void try_to_unmap(struct folio *folio, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
};
@@ -2150,7 +2150,7 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_migrate_one,
.arg = (void *)flags,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
};
@@ -2297,7 +2297,7 @@ static bool folio_make_device_exclusive(struct folio *folio,
};
struct rmap_walk_control rwc = {
.rmap_one = page_make_device_exclusive_one,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
.arg = &args,
};
@@ -2571,7 +2571,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(compound_mapcount_ptr(page), 0);
atomic_set(compound_pincount_ptr(page), 0);
-
+ ClearHPageRestoreReserve(page);
__page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index c1d8b8a1aa3b..0a7c4a748811 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,21 +922,18 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_batch_init(&fbatch);
index = start;
- while (index < end && find_lock_entries(mapping, index, end - 1,
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
- index = indices[i];
-
if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
- index, folio);
+ indices[i], folio);
continue;
}
- index += folio_nr_pages(folio) - 1;
if (!unfalloc || !folio_test_uptodate(folio))
truncate_inode_folio(mapping, folio);
@@ -945,7 +942,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
@@ -977,7 +973,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &fbatch,
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
@@ -989,13 +985,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
- index = indices[i];
if (xa_is_value(folio)) {
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, index, folio)) {
+ if (shmem_free_swap(mapping, indices[i], folio)) {
/* Swap was replaced by page: retry */
- index--;
+ index = indices[i];
break;
}
nr_swaps_freed++;
@@ -1008,19 +1003,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
folio_unlock(folio);
- index--;
+ index = indices[i];
break;
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
truncate_inode_folio(mapping, folio);
}
- index = folio->index + folio_nr_pages(folio) - 1;
folio_unlock(folio);
}
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
- index++;
}
spin_lock_irq(&info->lock);
@@ -1833,7 +1826,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
struct folio *folio;
- pgoff_t hindex = index;
+ pgoff_t hindex;
gfp_t huge_gfp;
int error;
int once = 0;
@@ -1871,7 +1864,6 @@ repeat:
}
if (folio) {
- hindex = folio->index;
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
if (folio_test_uptodate(folio))
@@ -3910,6 +3902,7 @@ EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
+ .open = generic_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
diff --git a/mm/slub.c b/mm/slub.c
index 157527d7101b..f37e6a51e233 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4771,11 +4771,6 @@ static int slab_memory_callback(struct notifier_block *self,
return ret;
}
-static struct notifier_block slab_memory_callback_nb = {
- .notifier_call = slab_memory_callback,
- .priority = SLAB_CALLBACK_PRI,
-};
-
/********************************************************************
* Basic setup of slabs
*******************************************************************/
@@ -4841,7 +4836,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
- register_hotmemory_notifier(&slab_memory_callback_nb);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
diff --git a/mm/sparse.c b/mm/sparse.c
index e5a8a3a0edd7..2779b419ef2a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
unsigned long nr_pages, unsigned long map_offset,
struct vmem_altmap *altmap)
{
- clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
- nr_pages - map_offset);
section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 955930f41d20..b9a6817e07ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -43,8 +43,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>
-/* How many pages do we try to swap or page in/out together? */
+/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
+const int page_cluster_max = 31;
/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
struct lru_rotate {
@@ -295,8 +296,20 @@ void folio_rotate_reclaimable(struct folio *folio)
}
}
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
+void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
{
+ unsigned long cost;
+
+ /*
+ * Reflect the relative cost of incurring IO and spending CPU
+ * time on rotations. This doesn't attempt to make a precise
+ * comparison, it just says: if reloads are about comparable
+ * between the LRU lists, or rotations are overwhelmingly
+ * different between them, adjust scan balance for CPU work.
+ */
+ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+
do {
unsigned long lrusize;
@@ -310,9 +323,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
- lruvec->file_cost += nr_pages;
+ lruvec->file_cost += cost;
else
- lruvec->anon_cost += nr_pages;
+ lruvec->anon_cost += cost;
/*
* Decay previous events
@@ -335,10 +348,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
} while ((lruvec = parent_lruvec(lruvec)));
}
-void lru_note_cost_folio(struct folio *folio)
+void lru_note_cost_refault(struct folio *folio)
{
lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
- folio_nr_pages(folio));
+ folio_nr_pages(folio), 0);
}
static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/swap.h b/mm/swap.h
index cc08c459c619..f78065c8ef52 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,7 +41,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index);
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
@@ -105,9 +106,10 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
}
static inline
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index)
{
- return find_get_page(mapping, index);
+ return filemap_get_folio(mapping, index);
}
static inline bool add_to_swap(struct folio *folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 438d0676c5be..40fe6f23e105 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -373,30 +373,28 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
}
/**
- * find_get_incore_page - Find and get a page from the page or swap caches.
+ * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
* @mapping: The address_space to search.
* @index: The page cache index.
*
- * This differs from find_get_page() in that it will also look for the
- * page in the swap cache.
+ * This differs from filemap_get_folio() in that it will also look for the
+ * folio in the swap cache.
*
- * Return: The found page or %NULL.
+ * Return: The found folio or %NULL.
*/
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index)
{
swp_entry_t swp;
struct swap_info_struct *si;
- struct page *page = pagecache_get_page(mapping, index,
- FGP_ENTRY | FGP_HEAD, 0);
+ struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0);
- if (!page)
- return page;
- if (!xa_is_value(page))
- return find_subpage(page, index);
+ if (!xa_is_value(folio))
+ goto out;
if (!shmem_mapping(mapping))
return NULL;
- swp = radix_to_swp_entry(page);
+ swp = radix_to_swp_entry(folio);
/* There might be swapin error entries in shmem mapping. */
if (non_swap_entry(swp))
return NULL;
@@ -404,9 +402,11 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
si = get_swap_device(swp);
if (!si)
return NULL;
- page = find_get_page(swap_address_space(swp), swp_offset(swp));
+ index = swp_offset(swp);
+ folio = filemap_get_folio(swap_address_space(swp), index);
put_swap_device(si);
- return page;
+out:
+ return folio;
}
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
diff --git a/mm/truncate.c b/mm/truncate.c
index c0be77e5c008..c7bfd247a651 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -361,9 +361,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
folio_batch_init(&fbatch);
index = start;
- while (index < end && find_lock_entries(mapping, index, end - 1,
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) {
- index = indices[folio_batch_count(&fbatch) - 1] + 1;
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
for (i = 0; i < folio_batch_count(&fbatch); i++)
truncate_cleanup_folio(fbatch.folios[i]);
@@ -401,7 +400,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = start;
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &fbatch,
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
@@ -415,21 +414,18 @@ void truncate_inode_pages_range(struct address_space *mapping,
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
- index = indices[i];
if (xa_is_value(folio))
continue;
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
truncate_inode_folio(mapping, folio);
folio_unlock(folio);
- index = folio_index(folio) + folio_nr_pages(folio) - 1;
}
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
folio_batch_release(&fbatch);
- index++;
}
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -510,20 +506,17 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
int i;
folio_batch_init(&fbatch);
- while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
+ while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
- index = indices[i];
if (xa_is_value(folio)) {
count += invalidate_exceptional_entry(mapping,
- index,
- folio);
+ indices[i], folio);
continue;
}
- index += folio_nr_pages(folio) - 1;
ret = mapping_evict_folio(mapping, folio);
folio_unlock(folio);
@@ -542,7 +535,6 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
return count;
}
@@ -641,16 +633,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
folio_batch_init(&fbatch);
index = start;
- while (find_get_entries(mapping, index, end, &fbatch, indices)) {
+ while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
- index = indices[i];
if (xa_is_value(folio)) {
if (!invalidate_exceptional_entry2(mapping,
- index, folio))
+ indices[i], folio))
ret = -EBUSY;
continue;
}
@@ -660,13 +651,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* If folio is mapped, before taking its lock,
* zap the rest of the file in one hit.
*/
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
+ unmap_mapping_pages(mapping, indices[i],
+ (1 + end - indices[i]), false);
did_range_unmap = 1;
}
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
if (folio->mapping != mapping) {
folio_unlock(folio);
continue;
@@ -689,7 +680,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
/*
* For DAX we invalidate page tables after invalidating page cache. We
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ccaa461998f3..ca71de7c9d77 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -43,6 +43,9 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmalloc.h>
+
#include "internal.h"
#include "pgalloc-track.h"
@@ -1620,6 +1623,8 @@ retry:
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
@@ -1725,6 +1730,7 @@ static void purge_fragmented_blocks_allcpus(void);
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
+ unsigned int num_purged_areas = 0;
struct list_head local_purge_list;
struct vmap_area *va, *n_va;
@@ -1736,7 +1742,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
spin_unlock(&purge_vmap_area_lock);
if (unlikely(list_empty(&local_purge_list)))
- return false;
+ goto out;
start = min(start,
list_first_entry(&local_purge_list,
@@ -1771,12 +1777,16 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
+ num_purged_areas++;
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
cond_resched_lock(&free_vmap_area_lock);
}
spin_unlock(&free_vmap_area_lock);
- return true;
+
+out:
+ trace_purge_vmap_area_lazy(start, end, num_purged_areas);
+ return num_purged_areas > 0;
}
/*
@@ -1811,6 +1821,8 @@ static void drain_vmap_area_work(struct work_struct *work)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
+ unsigned long nr_lazy_max = lazy_max_pages();
+ unsigned long va_start = va->va_start;
unsigned long nr_lazy;
spin_lock(&vmap_area_lock);
@@ -1828,8 +1840,10 @@ static void free_vmap_area_noflush(struct vmap_area *va)
&purge_vmap_area_root, &purge_vmap_area_list);
spin_unlock(&purge_vmap_area_lock);
+ trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
+
/* After this point, we may free va at any time */
- if (unlikely(nr_lazy > lazy_max_pages()))
+ if (unlikely(nr_lazy > nr_lazy_max))
schedule_work(&drain_vmap_work);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8fcc5fa768c0..55a5b5d66d68 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2499,7 +2499,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout);
+ lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
mem_cgroup_uncharge_list(&folio_list);
free_unref_page_list(&folio_list);
@@ -2651,6 +2651,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
+ if (nr_rotated)
+ lru_note_cost(lruvec, file, 0, nr_rotated);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
diff --git a/mm/workingset.c b/mm/workingset.c
index ae7e984b23c6..d2d02978588c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -493,7 +493,7 @@ void workingset_refault(struct folio *folio, void *shadow)
if (workingset) {
folio_set_workingset(folio);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
- lru_note_cost_folio(folio);
+ lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out:
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 7b9dc2426f18..8a536c731e3c 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
+anon_cow
hugepage-mmap
hugepage-mremap
hugepage-shm
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 163c2fde3cb3..00920cb8b499 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,7 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
# Makefile for vm selftests
-LOCAL_HDRS += $(top_srcdir)/mm/gup_test.h
+LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h
+
+include local_config.mk
uname_M := $(shell uname -m 2>/dev/null || echo not)
MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
@@ -25,7 +27,8 @@ MAKEFLAGS += --no-builtin-rules
CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
LDLIBS = -lrt -lpthread
-TEST_GEN_FILES = compaction_test
+TEST_GEN_FILES = anon_cow
+TEST_GEN_FILES += compaction_test
TEST_GEN_FILES += gup_test
TEST_GEN_FILES += hmm-tests
TEST_GEN_FILES += hugetlb-madvise
@@ -95,6 +98,7 @@ TEST_FILES += va_128TBswitch.sh
include ../lib.mk
+$(OUTPUT)/anon_cow: vm_util.c
$(OUTPUT)/khugepaged: vm_util.c
$(OUTPUT)/madv_populate: vm_util.c
$(OUTPUT)/soft-dirty: vm_util.c
@@ -150,8 +154,25 @@ warn_32bit_failure:
endif
endif
+# ANON_COW_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
+$(OUTPUT)/anon_cow: LDLIBS += $(ANON_COW_EXTRA_LIBS)
+
$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
$(OUTPUT)/migration: LDLIBS += -lnuma
+
+local_config.mk local_config.h: check_config.sh
+ /bin/sh ./check_config.sh $(CC)
+
+EXTRA_CLEAN += local_config.mk local_config.h
+
+ifeq ($(ANON_COW_EXTRA_LIBS),)
+all: warn_missing_liburing
+
+warn_missing_liburing:
+ @echo ; \
+ echo "Warning: missing liburing support. Some COW tests will be skipped." ; \
+ echo
+endif
diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/anon_cow.c
new file mode 100644
index 000000000000..705bd0b3db11
--- /dev/null
+++ b/tools/testing/selftests/vm/anon_cow.c
@@ -0,0 +1,1126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * COW (Copy On Write) tests for anonymous memory.
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <assert.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+
+#include "local_config.h"
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+#include <liburing.h>
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+#include "../../../../mm/gup_test.h"
+#include "../kselftest.h"
+#include "vm_util.h"
+
+static size_t pagesize;
+static int pagemap_fd;
+static size_t thpsize;
+static int nr_hugetlbsizes;
+static size_t hugetlbsizes[10];
+static int gup_fd;
+
+static void detect_thpsize(void)
+{
+ int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
+ O_RDONLY);
+ size_t size = 0;
+ char buf[15];
+ int ret;
+
+ if (fd < 0)
+ return;
+
+ ret = pread(fd, buf, sizeof(buf), 0);
+ if (ret > 0 && ret < sizeof(buf)) {
+ buf[ret] = 0;
+
+ size = strtoul(buf, NULL, 10);
+ if (size < pagesize)
+ size = 0;
+ if (size > 0) {
+ thpsize = size;
+ ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
+ thpsize / 1024);
+ }
+ }
+
+ close(fd);
+}
+
+static void detect_hugetlbsizes(void)
+{
+ DIR *dir = opendir("/sys/kernel/mm/hugepages/");
+
+ if (!dir)
+ return;
+
+ while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) {
+ struct dirent *entry = readdir(dir);
+ size_t kb;
+
+ if (!entry)
+ break;
+ if (entry->d_type != DT_DIR)
+ continue;
+ if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1)
+ continue;
+ hugetlbsizes[nr_hugetlbsizes] = kb * 1024;
+ nr_hugetlbsizes++;
+ ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n",
+ kb);
+ }
+ closedir(dir);
+}
+
+static bool range_is_swapped(void *addr, size_t size)
+{
+ for (; size; addr += pagesize, size -= pagesize)
+ if (!pagemap_is_swapped(pagemap_fd, addr))
+ return false;
+ return true;
+}
+
+struct comm_pipes {
+ int child_ready[2];
+ int parent_ready[2];
+};
+
+static int setup_comm_pipes(struct comm_pipes *comm_pipes)
+{
+ if (pipe(comm_pipes->child_ready) < 0)
+ return -errno;
+ if (pipe(comm_pipes->parent_ready) < 0) {
+ close(comm_pipes->child_ready[0]);
+ close(comm_pipes->child_ready[1]);
+ return -errno;
+ }
+
+ return 0;
+}
+
+static void close_comm_pipes(struct comm_pipes *comm_pipes)
+{
+ close(comm_pipes->child_ready[0]);
+ close(comm_pipes->child_ready[1]);
+ close(comm_pipes->parent_ready[0]);
+ close(comm_pipes->parent_ready[1]);
+}
+
+static int child_memcmp_fn(char *mem, size_t size,
+ struct comm_pipes *comm_pipes)
+{
+ char *old = malloc(size);
+ char buf;
+
+ /* Backup the original content. */
+ memcpy(old, mem, size);
+
+ /* Wait until the parent modified the page. */
+ write(comm_pipes->child_ready[1], "0", 1);
+ while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+ ;
+
+ /* See if we still read the old values. */
+ return memcmp(old, mem, size);
+}
+
+static int child_vmsplice_memcmp_fn(char *mem, size_t size,
+ struct comm_pipes *comm_pipes)
+{
+ struct iovec iov = {
+ .iov_base = mem,
+ .iov_len = size,
+ };
+ ssize_t cur, total, transferred;
+ char *old, *new;
+ int fds[2];
+ char buf;
+
+ old = malloc(size);
+ new = malloc(size);
+
+ /* Backup the original content. */
+ memcpy(old, mem, size);
+
+ if (pipe(fds) < 0)
+ return -errno;
+
+ /* Trigger a read-only pin. */
+ transferred = vmsplice(fds[1], &iov, 1, 0);
+ if (transferred < 0)
+ return -errno;
+ if (transferred == 0)
+ return -EINVAL;
+
+ /* Unmap it from our page tables. */
+ if (munmap(mem, size) < 0)
+ return -errno;
+
+ /* Wait until the parent modified it. */
+ write(comm_pipes->child_ready[1], "0", 1);
+ while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+ ;
+
+ /* See if we still read the old values via the pipe. */
+ for (total = 0; total < transferred; total += cur) {
+ cur = read(fds[0], new + total, transferred - total);
+ if (cur < 0)
+ return -errno;
+ }
+
+ return memcmp(old, new, transferred);
+}
+
+typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
+
+static void do_test_cow_in_parent(char *mem, size_t size, child_fn fn)
+{
+ struct comm_pipes comm_pipes;
+ char buf;
+ int ret;
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ ksft_test_result_fail("pipe() failed\n");
+ return;
+ }
+
+ ret = fork();
+ if (ret < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ goto close_comm_pipes;
+ } else if (!ret) {
+ exit(fn(mem, size, &comm_pipes));
+ }
+
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+ /* Modify the page. */
+ memset(mem, 0xff, size);
+ write(comm_pipes.parent_ready[1], "0", 1);
+
+ wait(&ret);
+ if (WIFEXITED(ret))
+ ret = WEXITSTATUS(ret);
+ else
+ ret = -EINVAL;
+
+ ksft_test_result(!ret, "No leak from parent into child\n");
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+}
+
+static void test_cow_in_parent(char *mem, size_t size)
+{
+ do_test_cow_in_parent(mem, size, child_memcmp_fn);
+}
+
+static void test_vmsplice_in_child(char *mem, size_t size)
+{
+ do_test_cow_in_parent(mem, size, child_vmsplice_memcmp_fn);
+}
+
+static void do_test_vmsplice_in_parent(char *mem, size_t size,
+ bool before_fork)
+{
+ struct iovec iov = {
+ .iov_base = mem,
+ .iov_len = size,
+ };
+ ssize_t cur, total, transferred;
+ struct comm_pipes comm_pipes;
+ char *old, *new;
+ int ret, fds[2];
+ char buf;
+
+ old = malloc(size);
+ new = malloc(size);
+
+ memcpy(old, mem, size);
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ ksft_test_result_fail("pipe() failed\n");
+ goto free;
+ }
+
+ if (pipe(fds) < 0) {
+ ksft_test_result_fail("pipe() failed\n");
+ goto close_comm_pipes;
+ }
+
+ if (before_fork) {
+ transferred = vmsplice(fds[1], &iov, 1, 0);
+ if (transferred <= 0) {
+ ksft_test_result_fail("vmsplice() failed\n");
+ goto close_pipe;
+ }
+ }
+
+ ret = fork();
+ if (ret < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ goto close_pipe;
+ } else if (!ret) {
+ write(comm_pipes.child_ready[1], "0", 1);
+ while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+ ;
+ /* Modify page content in the child. */
+ memset(mem, 0xff, size);
+ exit(0);
+ }
+
+ if (!before_fork) {
+ transferred = vmsplice(fds[1], &iov, 1, 0);
+ if (transferred <= 0) {
+ ksft_test_result_fail("vmsplice() failed\n");
+ wait(&ret);
+ goto close_pipe;
+ }
+ }
+
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+ if (munmap(mem, size) < 0) {
+ ksft_test_result_fail("munmap() failed\n");
+ goto close_pipe;
+ }
+ write(comm_pipes.parent_ready[1], "0", 1);
+
+ /* Wait until the child is done writing. */
+ wait(&ret);
+ if (!WIFEXITED(ret)) {
+ ksft_test_result_fail("wait() failed\n");
+ goto close_pipe;
+ }
+
+ /* See if we still read the old values. */
+ for (total = 0; total < transferred; total += cur) {
+ cur = read(fds[0], new + total, transferred - total);
+ if (cur < 0) {
+ ksft_test_result_fail("read() failed\n");
+ goto close_pipe;
+ }
+ }
+
+ ksft_test_result(!memcmp(old, new, transferred),
+ "No leak from child into parent\n");
+close_pipe:
+ close(fds[0]);
+ close(fds[1]);
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+free:
+ free(old);
+ free(new);
+}
+
+static void test_vmsplice_before_fork(char *mem, size_t size)
+{
+ do_test_vmsplice_in_parent(mem, size, true);
+}
+
+static void test_vmsplice_after_fork(char *mem, size_t size)
+{
+ do_test_vmsplice_in_parent(mem, size, false);
+}
+
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+static void do_test_iouring(char *mem, size_t size, bool use_fork)
+{
+ struct comm_pipes comm_pipes;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ struct io_uring ring;
+ ssize_t cur, total;
+ struct iovec iov;
+ char *buf, *tmp;
+ int ret, fd;
+ FILE *file;
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ ksft_test_result_fail("pipe() failed\n");
+ return;
+ }
+
+ file = tmpfile();
+ if (!file) {
+ ksft_test_result_fail("tmpfile() failed\n");
+ goto close_comm_pipes;
+ }
+ fd = fileno(file);
+ assert(fd);
+
+ tmp = malloc(size);
+ if (!tmp) {
+ ksft_test_result_fail("malloc() failed\n");
+ goto close_file;
+ }
+
+ /* Skip on errors, as we might just lack kernel support. */
+ ret = io_uring_queue_init(1, &ring, 0);
+ if (ret < 0) {
+ ksft_test_result_skip("io_uring_queue_init() failed\n");
+ goto free_tmp;
+ }
+
+ /*
+ * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
+ * | FOLL_LONGTERM the range.
+ *
+ * Skip on errors, as we might just lack kernel support or might not
+ * have sufficient MEMLOCK permissions.
+ */
+ iov.iov_base = mem;
+ iov.iov_len = size;
+ ret = io_uring_register_buffers(&ring, &iov, 1);
+ if (ret) {
+ ksft_test_result_skip("io_uring_register_buffers() failed\n");
+ goto queue_exit;
+ }
+
+ if (use_fork) {
+ /*
+ * fork() and keep the child alive until we're done. Note that
+ * we expect the pinned page to not get shared with the child.
+ */
+ ret = fork();
+ if (ret < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ goto unregister_buffers;
+ } else if (!ret) {
+ write(comm_pipes.child_ready[1], "0", 1);
+ while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+ ;
+ exit(0);
+ }
+
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+ } else {
+ /*
+ * Map the page R/O into the page table. Enable softdirty
+ * tracking to stop the page from getting mapped R/W immediately
+ * again by mprotect() optimizations. Note that we don't have an
+ * easy way to test if that worked (the pagemap does not export
+ * if the page is mapped R/O vs. R/W).
+ */
+ ret = mprotect(mem, size, PROT_READ);
+ clear_softdirty();
+ ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto unregister_buffers;
+ }
+ }
+
+ /*
+ * Modify the page and write page content as observed by the fixed
+ * buffer pin to the file so we can verify it.
+ */
+ memset(mem, 0xff, size);
+ sqe = io_uring_get_sqe(&ring);
+ if (!sqe) {
+ ksft_test_result_fail("io_uring_get_sqe() failed\n");
+ goto quit_child;
+ }
+ io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
+
+ ret = io_uring_submit(&ring);
+ if (ret < 0) {
+ ksft_test_result_fail("io_uring_submit() failed\n");
+ goto quit_child;
+ }
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret < 0) {
+ ksft_test_result_fail("io_uring_wait_cqe() failed\n");
+ goto quit_child;
+ }
+
+ if (cqe->res != size) {
+ ksft_test_result_fail("write_fixed failed\n");
+ goto quit_child;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+
+ /* Read back the file content to the temporary buffer. */
+ total = 0;
+ while (total < size) {
+ cur = pread(fd, tmp + total, size - total, total);
+ if (cur < 0) {
+ ksft_test_result_fail("pread() failed\n");
+ goto quit_child;
+ }
+ total += cur;
+ }
+
+ /* Finally, check if we read what we expected. */
+ ksft_test_result(!memcmp(mem, tmp, size),
+ "Longterm R/W pin is reliable\n");
+
+quit_child:
+ if (use_fork) {
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ }
+unregister_buffers:
+ io_uring_unregister_buffers(&ring);
+queue_exit:
+ io_uring_queue_exit(&ring);
+free_tmp:
+ free(tmp);
+close_file:
+ fclose(file);
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+}
+
+static void test_iouring_ro(char *mem, size_t size)
+{
+ do_test_iouring(mem, size, false);
+}
+
+static void test_iouring_fork(char *mem, size_t size)
+{
+ do_test_iouring(mem, size, true);
+}
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+enum ro_pin_test {
+ RO_PIN_TEST_SHARED,
+ RO_PIN_TEST_PREVIOUSLY_SHARED,
+ RO_PIN_TEST_RO_EXCLUSIVE,
+};
+
+static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
+ bool fast)
+{
+ struct pin_longterm_test args;
+ struct comm_pipes comm_pipes;
+ char *tmp, buf;
+ __u64 tmp_val;
+ int ret;
+
+ if (gup_fd < 0) {
+ ksft_test_result_skip("gup_test not available\n");
+ return;
+ }
+
+ tmp = malloc(size);
+ if (!tmp) {
+ ksft_test_result_fail("malloc() failed\n");
+ return;
+ }
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ ksft_test_result_fail("pipe() failed\n");
+ goto free_tmp;
+ }
+
+ switch (test) {
+ case RO_PIN_TEST_SHARED:
+ case RO_PIN_TEST_PREVIOUSLY_SHARED:
+ /*
+ * Share the pages with our child. As the pages are not pinned,
+ * this should just work.
+ */
+ ret = fork();
+ if (ret < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ goto close_comm_pipes;
+ } else if (!ret) {
+ write(comm_pipes.child_ready[1], "0", 1);
+ while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+ ;
+ exit(0);
+ }
+
+ /* Wait until our child is ready. */
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+
+ if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
+ /*
+ * Tell the child to quit now and wait until it quit.
+ * The pages should now be mapped R/O into our page
+ * tables, but they are no longer shared.
+ */
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ if (!WIFEXITED(ret))
+ ksft_print_msg("[INFO] wait() failed\n");
+ }
+ break;
+ case RO_PIN_TEST_RO_EXCLUSIVE:
+ /*
+ * Map the page R/O into the page table. Enable softdirty
+ * tracking to stop the page from getting mapped R/W immediately
+ * again by mprotect() optimizations. Note that we don't have an
+ * easy way to test if that worked (the pagemap does not export
+ * if the page is mapped R/O vs. R/W).
+ */
+ ret = mprotect(mem, size, PROT_READ);
+ clear_softdirty();
+ ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto close_comm_pipes;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ /* Take a R/O pin. This should trigger unsharing. */
+ args.addr = (__u64)mem;
+ args.size = size;
+ args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
+ if (ret) {
+ if (errno == EINVAL)
+ ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
+ else
+ ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
+ goto wait;
+ }
+
+ /* Modify the page. */
+ memset(mem, 0xff, size);
+
+ /*
+ * Read back the content via the pin to the temporary buffer and
+ * test if we observed the modification.
+ */
+ tmp_val = (__u64)tmp;
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
+ if (ret)
+ ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
+ else
+ ksft_test_result(!memcmp(mem, tmp, size),
+ "Longterm R/O pin is reliable\n");
+
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
+ if (ret)
+ ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
+wait:
+ switch (test) {
+ case RO_PIN_TEST_SHARED:
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ if (!WIFEXITED(ret))
+ ksft_print_msg("[INFO] wait() failed\n");
+ break;
+ default:
+ break;
+ }
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+free_tmp:
+ free(tmp);
+}
+
+static void test_ro_pin_on_shared(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_shared(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
+}
+
+static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
+}
+
+typedef void (*test_fn)(char *mem, size_t size);
+
+static void do_run_with_base_page(test_fn fn, bool swapout)
+{
+ char *mem;
+ int ret;
+
+ mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
+ /* Ignore if not around on a kernel. */
+ if (ret && errno != EINVAL) {
+ ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+ goto munmap;
+ }
+
+ /* Populate a base page. */
+ memset(mem, 0, pagesize);
+
+ if (swapout) {
+ madvise(mem, pagesize, MADV_PAGEOUT);
+ if (!pagemap_is_swapped(pagemap_fd, mem)) {
+ ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+ goto munmap;
+ }
+ }
+
+ fn(mem, pagesize);
+munmap:
+ munmap(mem, pagesize);
+}
+
+static void run_with_base_page(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with base page\n", desc);
+ do_run_with_base_page(fn, false);
+}
+
+static void run_with_base_page_swap(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
+ do_run_with_base_page(fn, true);
+}
+
+enum thp_run {
+ THP_RUN_PMD,
+ THP_RUN_PMD_SWAPOUT,
+ THP_RUN_PTE,
+ THP_RUN_PTE_SWAPOUT,
+ THP_RUN_SINGLE_PTE,
+ THP_RUN_SINGLE_PTE_SWAPOUT,
+ THP_RUN_PARTIAL_MREMAP,
+ THP_RUN_PARTIAL_SHARED,
+};
+
+static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
+{
+ char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
+ size_t size, mmap_size, mremap_size;
+ int ret;
+
+ /* For alignment purposes, we need twice the thp size. */
+ mmap_size = 2 * thpsize;
+ mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mmap_mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ /* We need a THP-aligned memory area. */
+ mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+
+ ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+ if (ret) {
+ ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+ goto munmap;
+ }
+
+ /*
+ * Try to populate a THP. Touch the first sub-page and test if we get
+ * another sub-page populated automatically.
+ */
+ mem[0] = 0;
+ if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
+ ksft_test_result_skip("Did not get a THP populated\n");
+ goto munmap;
+ }
+ memset(mem, 0, thpsize);
+
+ size = thpsize;
+ switch (thp_run) {
+ case THP_RUN_PMD:
+ case THP_RUN_PMD_SWAPOUT:
+ break;
+ case THP_RUN_PTE:
+ case THP_RUN_PTE_SWAPOUT:
+ /*
+ * Trigger PTE-mapping the THP by temporarily mapping a single
+ * subpage R/O.
+ */
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+ break;
+ case THP_RUN_SINGLE_PTE:
+ case THP_RUN_SINGLE_PTE_SWAPOUT:
+ /*
+ * Discard all but a single subpage of that PTE-mapped THP. What
+ * remains is a single PTE mapping a single subpage.
+ */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
+ if (ret) {
+ ksft_test_result_fail("MADV_DONTNEED failed\n");
+ goto munmap;
+ }
+ size = pagesize;
+ break;
+ case THP_RUN_PARTIAL_MREMAP:
+ /*
+ * Remap half of the THP. We need some new memory location
+ * for that.
+ */
+ mremap_size = thpsize / 2;
+ mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ goto munmap;
+ }
+ tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
+ if (tmp != mremap_mem) {
+ ksft_test_result_fail("mremap() failed\n");
+ goto munmap;
+ }
+ size = mremap_size;
+ break;
+ case THP_RUN_PARTIAL_SHARED:
+ /*
+ * Share the first page of the THP with a child and quit the
+ * child. This will result in some parts of the THP never
+ * have been shared.
+ */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
+ if (ret) {
+ ksft_test_result_fail("MADV_DONTFORK failed\n");
+ goto munmap;
+ }
+ ret = fork();
+ if (ret < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ goto munmap;
+ } else if (!ret) {
+ exit(0);
+ }
+ wait(&ret);
+ /* Allow for sharing all pages again. */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
+ if (ret) {
+ ksft_test_result_fail("MADV_DOFORK failed\n");
+ goto munmap;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ switch (thp_run) {
+ case THP_RUN_PMD_SWAPOUT:
+ case THP_RUN_PTE_SWAPOUT:
+ case THP_RUN_SINGLE_PTE_SWAPOUT:
+ madvise(mem, size, MADV_PAGEOUT);
+ if (!range_is_swapped(mem, size)) {
+ ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+ goto munmap;
+ }
+ break;
+ default:
+ break;
+ }
+
+ fn(mem, size);
+munmap:
+ munmap(mmap_mem, mmap_size);
+ if (mremap_mem != MAP_FAILED)
+ munmap(mremap_mem, mremap_size);
+}
+
+static void run_with_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PMD);
+}
+
+static void run_with_thp_swap(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
+}
+
+static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PTE);
+}
+
+static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
+}
+
+static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
+}
+
+static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
+}
+
+static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
+}
+
+static void run_with_partial_shared_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
+}
+
+static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
+{
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
+ char *mem, *dummy;
+
+ ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
+ hugetlbsize / 1024);
+
+ flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
+
+ mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_skip("need more free huge pages\n");
+ return;
+ }
+
+ /* Populate an huge page. */
+ memset(mem, 0, hugetlbsize);
+
+ /*
+ * We need a total of two hugetlb pages to handle COW/unsharing
+ * properly, otherwise we might get zapped by a SIGBUS.
+ */
+ dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (dummy == MAP_FAILED) {
+ ksft_test_result_skip("need more free huge pages\n");
+ goto munmap;
+ }
+ munmap(dummy, hugetlbsize);
+
+ fn(mem, hugetlbsize);
+munmap:
+ munmap(mem, hugetlbsize);
+}
+
+struct test_case {
+ const char *desc;
+ test_fn fn;
+};
+
+static const struct test_case test_cases[] = {
+ /*
+ * Basic COW tests for fork() without any GUP. If we miss to break COW,
+ * either the child can observe modifications by the parent or the
+ * other way around.
+ */
+ {
+ "Basic COW after fork()",
+ test_cow_in_parent,
+ },
+ /*
+ * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
+ * we miss to break COW, the child observes modifications by the parent.
+ * This is CVE-2020-29374 reported by Jann Horn.
+ */
+ {
+ "vmsplice() + unmap in child",
+ test_vmsplice_in_child
+ },
+ /*
+ * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
+ * fork(); modify in the child. If we miss to break COW, the parent
+ * observes modifications by the child.
+ */
+ {
+ "vmsplice() before fork(), unmap in parent after fork()",
+ test_vmsplice_before_fork,
+ },
+ /*
+ * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
+ * child. If we miss to break COW, the parent observes modifications by
+ * the child.
+ */
+ {
+ "vmsplice() + unmap in parent after fork()",
+ test_vmsplice_after_fork,
+ },
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+ /*
+ * Take a R/W longterm pin and then map the page R/O into the page
+ * table to trigger a write fault on next access. When modifying the
+ * page, the page content must be visible via the pin.
+ */
+ {
+ "R/O-mapping a page registered as iouring fixed buffer",
+ test_iouring_ro,
+ },
+ /*
+ * Take a R/W longterm pin and then fork() a child. When modifying the
+ * page, the page content must be visible via the pin. We expect the
+ * pinned page to not get shared with the child.
+ */
+ {
+ "fork() with an iouring fixed buffer",
+ test_iouring_fork,
+ },
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+ /*
+ * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
+ * When modifying the page via the page table, the page content change
+ * must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped shared page",
+ test_ro_pin_on_shared,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped shared page",
+ test_ro_fast_pin_on_shared,
+ },
+ /*
+ * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
+ * was previously shared. When modifying the page via the page table,
+ * the page content change must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped previously-shared page",
+ test_ro_pin_on_ro_previously_shared,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped previously-shared page",
+ test_ro_fast_pin_on_ro_previously_shared,
+ },
+ /*
+ * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
+ * When modifying the page via the page table, the page content change
+ * must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped exclusive page",
+ test_ro_pin_on_ro_exclusive,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped exclusive page",
+ test_ro_fast_pin_on_ro_exclusive,
+ },
+};
+
+static void run_test_case(struct test_case const *test_case)
+{
+ int i;
+
+ run_with_base_page(test_case->fn, test_case->desc);
+ run_with_base_page_swap(test_case->fn, test_case->desc);
+ if (thpsize) {
+ run_with_thp(test_case->fn, test_case->desc);
+ run_with_thp_swap(test_case->fn, test_case->desc);
+ run_with_pte_mapped_thp(test_case->fn, test_case->desc);
+ run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
+ run_with_single_pte_of_thp(test_case->fn, test_case->desc);
+ run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
+ run_with_partial_mremap_thp(test_case->fn, test_case->desc);
+ run_with_partial_shared_thp(test_case->fn, test_case->desc);
+ }
+ for (i = 0; i < nr_hugetlbsizes; i++)
+ run_with_hugetlb(test_case->fn, test_case->desc,
+ hugetlbsizes[i]);
+}
+
+static void run_test_cases(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_cases); i++)
+ run_test_case(&test_cases[i]);
+}
+
+static int tests_per_test_case(void)
+{
+ int tests = 2 + nr_hugetlbsizes;
+
+ if (thpsize)
+ tests += 8;
+ return tests;
+}
+
+int main(int argc, char **argv)
+{
+ int nr_test_cases = ARRAY_SIZE(test_cases);
+ int err;
+
+ pagesize = getpagesize();
+ detect_thpsize();
+ detect_hugetlbsizes();
+
+ ksft_print_header();
+ ksft_set_plan(nr_test_cases * tests_per_test_case());
+
+ gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+
+ run_test_cases();
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh
new file mode 100644
index 000000000000..9a44c6520925
--- /dev/null
+++ b/tools/testing/selftests/vm/check_config.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Probe for libraries and create header files to record the results. Both C
+# header files and Makefile include fragments are created.
+
+OUTPUT_H_FILE=local_config.h
+OUTPUT_MKFILE=local_config.mk
+
+tmpname=$(mktemp)
+tmpfile_c=${tmpname}.c
+tmpfile_o=${tmpname}.o
+
+# liburing
+echo "#include <sys/types.h>" > $tmpfile_c
+echo "#include <liburing.h>" >> $tmpfile_c
+echo "int func(void) { return 0; }" >> $tmpfile_c
+
+CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
+$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+
+if [ -f $tmpfile_o ]; then
+ echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE
+ echo "ANON_COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE
+else
+ echo "// No liburing support found" > $OUTPUT_H_FILE
+ echo "# No liburing support found, so:" > $OUTPUT_MKFILE
+ echo "ANON_COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE
+fi
+
+rm ${tmpname}.*
diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c
index e63a0214f639..e53b5eaa8fce 100644
--- a/tools/testing/selftests/vm/hugepage-mremap.c
+++ b/tools/testing/selftests/vm/hugepage-mremap.c
@@ -22,6 +22,7 @@
#include <sys/syscall.h> /* Definition of SYS_* constants */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
+#include <string.h>
#define DEFAULT_LENGTH_MB 10UL
#define MB_TO_BYTES(x) (x * 1024 * 1024)
@@ -108,26 +109,23 @@ static void register_region_with_uffd(char *addr, size_t len)
int main(int argc, char *argv[])
{
size_t length = 0;
+ int ret = 0, fd;
- if (argc != 2 && argc != 3) {
- printf("Usage: %s [length_in_MB] <hugetlb_file>\n", argv[0]);
+ if (argc >= 2 && !strcmp(argv[1], "-h")) {
+ printf("Usage: %s [length_in_MB]\n", argv[0]);
exit(1);
}
/* Read memory length as the first arg if valid, otherwise fallback to
* the default length.
*/
- if (argc == 3)
- length = argc > 2 ? (size_t)atoi(argv[1]) : 0UL;
+ if (argc >= 2)
+ length = (size_t)atoi(argv[1]);
+ else
+ length = DEFAULT_LENGTH_MB;
- length = length > 0 ? length : DEFAULT_LENGTH_MB;
length = MB_TO_BYTES(length);
-
- int ret = 0;
-
- /* last arg is the hugetlb file name */
- int fd = open(argv[argc-1], O_CREAT | O_RDWR, 0755);
-
+ fd = memfd_create(argv[0], MFD_HUGETLB);
if (fd < 0) {
perror("Open failed");
exit(1);
@@ -185,7 +183,6 @@ int main(int argc, char *argv[])
}
close(fd);
- unlink(argv[argc-1]);
return ret;
}
diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c
index 3c9943131881..f96435b70986 100644
--- a/tools/testing/selftests/vm/hugetlb-madvise.c
+++ b/tools/testing/selftests/vm/hugetlb-madvise.c
@@ -12,6 +12,7 @@
* directory.
*/
+#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
@@ -19,7 +20,6 @@
#define __USE_GNU
#include <fcntl.h>
-#define USAGE "USAGE: %s <hugepagefile_name>\n"
#define MIN_FREE_PAGES 20
#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */
@@ -103,11 +103,6 @@ int main(int argc, char **argv)
int fd;
int ret;
- if (argc != 2) {
- printf(USAGE, argv[0]);
- exit(1);
- }
-
huge_page_size = default_huge_page_size();
if (!huge_page_size) {
printf("Unable to determine huge page size, exiting!\n");
@@ -125,9 +120,9 @@ int main(int argc, char **argv)
exit(1);
}
- fd = open(argv[1], O_CREAT | O_RDWR, 0755);
+ fd = memfd_create(argv[0], MFD_HUGETLB);
if (fd < 0) {
- perror("Open failed");
+ perror("memfd_create() failed");
exit(1);
}
@@ -406,6 +401,5 @@ int main(int argc, char **argv)
(void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
close(fd);
- unlink(argv[1]);
return 0;
}
diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c
index 715a42e8e2cd..60547245e479 100644
--- a/tools/testing/selftests/vm/madv_populate.c
+++ b/tools/testing/selftests/vm/madv_populate.c
@@ -27,14 +27,6 @@
static size_t pagesize;
-static bool pagemap_is_populated(int fd, char *start)
-{
- uint64_t entry = pagemap_get_entry(fd, start);
-
- /* Present or swapped. */
- return entry & 0xc000000000000000ull;
-}
-
static void sense_support(void)
{
char *addr;
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index e780e76c26b8..1fa783732296 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -5,7 +5,6 @@
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
-mnt=./huge
exitcode=0
#get huge pagesize and freepages from /proc/meminfo
@@ -84,9 +83,6 @@ run_test() {
fi
}
-mkdir "$mnt"
-mount -t hugetlbfs none "$mnt"
-
run_test ./hugepage-mmap
shmmax=$(cat /proc/sys/kernel/shmmax)
@@ -98,14 +94,9 @@ echo "$shmmax" > /proc/sys/kernel/shmmax
echo "$shmall" > /proc/sys/kernel/shmall
run_test ./map_hugetlb
-
-run_test ./hugepage-mremap "$mnt"/huge_mremap
-rm -f "$mnt"/huge_mremap
-
+run_test ./hugepage-mremap
run_test ./hugepage-vmemmap
-
-run_test ./hugetlb-madvise "$mnt"/madvise-test
-rm -f "$mnt"/madvise-test
+run_test ./hugetlb-madvise
echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
@@ -126,14 +117,11 @@ for mod in "${uffd_mods[@]}"; do
# Hugetlb tests require source and destination huge pages. Pass in half
# the size ($half_ufd_size_MB), which is used for *each*.
run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
- run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 "$mnt"/uffd-test
- rm -f "$mnt"/uffd-test
+ run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
run_test ./userfaultfd shmem${mod} 20 16
done
#cleanup
-umount "$mnt"
-rm -rf "$mnt"
echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
run_test ./compaction_test
@@ -197,4 +185,7 @@ fi
run_test ./soft-dirty
+# COW tests for anonymous memory
+run_test ./anon_cow
+
exit $exitcode
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 297f250c1d95..7f22844ed704 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -93,10 +93,8 @@ static volatile bool test_uffdio_zeropage_eexist = true;
static bool test_uffdio_wp = true;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;
-
static bool map_shared;
-static int shm_fd;
-static int huge_fd;
+static int mem_fd;
static unsigned long long *count_verify;
static int uffd = -1;
static int uffd_flags, finished, *pipefd;
@@ -143,7 +141,7 @@ const char *examples =
"# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
"./userfaultfd hugetlb 256 50\n\n"
"# Run the same hugetlb test but using shared file:\n"
- "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
+ "./userfaultfd hugetlb_shared 256 50\n\n"
"# 10MiB-~6GiB 999 bounces anonymous test, "
"continue forever unless an error triggers\n"
"while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
@@ -260,35 +258,21 @@ static void hugetlb_release_pages(char *rel_area)
static void hugetlb_allocate_area(void **alloc_area, bool is_src)
{
+ off_t size = nr_pages * page_size;
+ off_t offset = is_src ? 0 : size;
void *area_alias = NULL;
char **alloc_area_alias;
- if (!map_shared)
- *alloc_area = mmap(NULL,
- nr_pages * page_size,
- PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
- (is_src ? 0 : MAP_NORESERVE),
- -1,
- 0);
- else
- *alloc_area = mmap(NULL,
- nr_pages * page_size,
- PROT_READ | PROT_WRITE,
- MAP_SHARED |
- (is_src ? 0 : MAP_NORESERVE),
- huge_fd,
- is_src ? 0 : nr_pages * page_size);
+ *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+ (is_src ? 0 : MAP_NORESERVE),
+ mem_fd, offset);
if (*alloc_area == MAP_FAILED)
err("mmap of hugetlbfs file failed");
if (map_shared) {
- area_alias = mmap(NULL,
- nr_pages * page_size,
- PROT_READ | PROT_WRITE,
- MAP_SHARED,
- huge_fd,
- is_src ? 0 : nr_pages * page_size);
+ area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, mem_fd, offset);
if (area_alias == MAP_FAILED)
err("mmap of hugetlb file alias failed");
}
@@ -334,14 +318,14 @@ static void shmem_allocate_area(void **alloc_area, bool is_src)
}
*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
- shm_fd, offset);
+ mem_fd, offset);
if (*alloc_area == MAP_FAILED)
err("mmap of memfd failed");
if (test_collapse && *alloc_area != p)
err("mmap of memfd failed at %p", p);
area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
- shm_fd, offset);
+ mem_fd, offset);
if (area_alias == MAP_FAILED)
err("mmap of memfd alias failed");
if (test_collapse && area_alias != p_alias)
@@ -1841,21 +1825,17 @@ int main(int argc, char **argv)
}
nr_pages = nr_pages_per_cpu * nr_cpus;
- if (test_type == TEST_HUGETLB && map_shared) {
- if (argc < 5)
- usage();
- huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
- if (huge_fd < 0)
- err("Open of %s failed", argv[4]);
- if (ftruncate(huge_fd, 0))
- err("ftruncate %s to size 0 failed", argv[4]);
- } else if (test_type == TEST_SHMEM) {
- shm_fd = memfd_create(argv[0], 0);
- if (shm_fd < 0)
+ if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) {
+ unsigned int memfd_flags = 0;
+
+ if (test_type == TEST_HUGETLB)
+ memfd_flags = MFD_HUGETLB;
+ mem_fd = memfd_create(argv[0], memfd_flags);
+ if (mem_fd < 0)
err("memfd_create");
- if (ftruncate(shm_fd, nr_pages * page_size * 2))
+ if (ftruncate(mem_fd, nr_pages * page_size * 2))
err("ftruncate");
- if (fallocate(shm_fd,
+ if (fallocate(mem_fd,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
nr_pages * page_size * 2))
err("fallocate");
diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c
index f11f8adda521..5bbf7641a0f0 100644
--- a/tools/testing/selftests/vm/vm_util.c
+++ b/tools/testing/selftests/vm/vm_util.c
@@ -28,6 +28,21 @@ bool pagemap_is_softdirty(int fd, char *start)
return entry & 0x0080000000000000ull;
}
+bool pagemap_is_swapped(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ return entry & 0x4000000000000000ull;
+}
+
+bool pagemap_is_populated(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ /* Present or swapped. */
+ return entry & 0xc000000000000000ull;
+}
+
void clear_softdirty(void)
{
int ret;
diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h
index 5c35de454e08..80d5a6ad413b 100644
--- a/tools/testing/selftests/vm/vm_util.h
+++ b/tools/testing/selftests/vm/vm_util.h
@@ -4,6 +4,8 @@
uint64_t pagemap_get_entry(int fd, char *start);
bool pagemap_is_softdirty(int fd, char *start);
+bool pagemap_is_swapped(int fd, char *start);
+bool pagemap_is_populated(int fd, char *start);
void clear_softdirty(void);
bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
uint64_t read_pmd_pagesize(void);