From af7df1c986adfe0684cb0b91114e0ac442b942e9 Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Sun, 26 Feb 2023 16:16:33 +0300
Subject: selftests: cgroup: add 'malloc' failures checks in test_memcontrol

There are several 'malloc' calls in test_memcontrol which can fail.  Add
checks for 'malloc' failures to give more detail about why a test failed
and to avoid possible undefined behavior from a later NULL pointer
dereference (like the one in the alloc_anon_50M_check_swap function).

Link: https://lkml.kernel.org/r/20230226131634.34366-1-ivan.orlov0322@gmail.com
Signed-off-by: Ivan Orlov
Reviewed-by: Muchun Song
Acked-by: Shakeel Butt
Acked-by: Roman Gushchin
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Shuah Khan
Cc: Tejun Heo
Cc: Zefan Li
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/cgroup/test_memcontrol.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 1e616a8c6a9c..f4f7c0aef702 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -98,6 +98,11 @@ static int alloc_anon_50M_check(const char *cgroup, void *arg)
 	int ret = -1;
 
 	buf = malloc(size);
+	if (buf == NULL) {
+		fprintf(stderr, "malloc() failed\n");
+		return -1;
+	}
+
 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
 		*ptr = 0;
 
@@ -211,6 +216,11 @@ static int alloc_anon_noexit(const char *cgroup, void *arg)
 	char *buf, *ptr;
 
 	buf = malloc(size);
+	if (buf == NULL) {
+		fprintf(stderr, "malloc() failed\n");
+		return -1;
+	}
+
 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
 		*ptr = 0;
 
@@ -778,6 +788,11 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
 	int ret = -1;
 
 	buf = malloc(size);
+	if (buf == NULL) {
+		fprintf(stderr, "malloc() failed\n");
+		return -1;
+	}
+
 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
 		*ptr = 0;
 
--
cgit v1.2.3


From dd63bd7df41a8f9393a2e3ff9157a441c08eb996 Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Mon, 6 Mar 2023 11:09:07 -0500
Subject: selftests/mm: fix split huge page tests

Fix two inputs to check_anon_huge() and one if condition, so the tests
work as expected.
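For context, the helper these inputs feed (check_huge_anon() in the hunks below) works by reading the AnonHugePages accounting of the test VMA out of /proc/self/smaps and comparing it with an expected number of PMD-sized huge pages. The sketch below only illustrates that idea; the names and parsing details are assumptions, not the selftest's actual vm_util.c helper:

/*
 * Illustrative sketch, not the real check_huge_anon(): locate the VMA
 * containing @addr in /proc/self/smaps and compare its AnonHugePages
 * value with nr_expected PMD-sized pages.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <inttypes.h>

static bool anon_huge_matches(void *addr, int nr_expected, uint64_t pmd_pagesize)
{
	FILE *f = fopen("/proc/self/smaps", "r");
	char line[256];
	uint64_t anon_huge_kb = 0;
	bool in_vma = false;

	if (!f)
		return false;

	while (fgets(line, sizeof(line), f)) {
		uint64_t start, end;

		/* VMA header lines look like "start-end perms offset dev inode path" */
		if (sscanf(line, "%" SCNx64 "-%" SCNx64, &start, &end) == 2) {
			in_vma = (uintptr_t)addr >= start && (uintptr_t)addr < end;
			continue;
		}
		/* Field lines; pick up AnonHugePages for the VMA we care about. */
		if (in_vma &&
		    sscanf(line, "AnonHugePages: %" SCNu64 " kB", &anon_huge_kb) == 1)
			break;
	}
	fclose(f);

	return anon_huge_kb * 1024 == (uint64_t)nr_expected * pmd_pagesize;
}

Seen that way, the fix below is mechanical: the test area spans four PMD-sized pages, so the pre-split checks must expect 4 huge pages rather than 1, and after the split the test should only fail when AnonHugePages has not dropped to zero, hence the negated condition.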
Link: https://lkml.kernel.org/r/20230306160907.16804-1-zi.yan@sent.com Fixes: c07c343cda8e ("selftests/vm: dedup THP helpers") Signed-off-by: Zi Yan Reviewed-by: Zach O'Keefe Tested-by: Zach O'Keefe Acked-by: David Hildenbrand Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/split_huge_page_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 76e1c36dd9e5..b8558c7f1a39 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -106,7 +106,7 @@ void split_pmd_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - if (!check_huge_anon(one_page, 1, pmd_pagesize)) { + if (!check_huge_anon(one_page, 4, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } @@ -122,7 +122,7 @@ void split_pmd_thp(void) } - if (check_huge_anon(one_page, 0, pmd_pagesize)) { + if (!check_huge_anon(one_page, 0, pmd_pagesize)) { printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); } @@ -169,7 +169,7 @@ void split_pte_mapped_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - if (!check_huge_anon(one_page, 1, pmd_pagesize)) { + if (!check_huge_anon(one_page, 4, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } -- cgit v1.2.3 From 47fba2b6d5bae89df5b1434bfdad4a56c7e88059 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Mar 2023 17:37:11 -0500 Subject: selftests/mm: smoke test UFFD_FEATURE_WP_UNPOPULATED Enable it by default on the stress test, and add some smoke tests for the pte markers on anonymous. Link: https://lkml.kernel.org/r/20230309223711.823547-3-peterx@redhat.com Signed-off-by: Peter Xu Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Mike Rapoport Cc: Muhammad Usama Anjum Cc: Nadav Amit Cc: Paul Gofman Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/userfaultfd.c | 45 ++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index 7f22844ed704..e030d63c031a 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -1444,6 +1444,43 @@ static int pagemap_test_fork(bool present) return result; } +static void userfaultfd_wp_unpopulated_test(int pagemap_fd) +{ + uint64_t value; + + /* Test applying pte marker to anon unpopulated */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + + /* Test unprotect on anon pte marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Test zap on anon marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Test fault in after marker removed */ + *area_dst = 1; + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + /* Drop it to make pte none again */ + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + + /* Test read-zero-page upon pte marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, 
true); + *(volatile char *)area_dst; + /* Drop it to make pte none again */ + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); +} + static void userfaultfd_pagemap_test(unsigned int test_pgsize) { struct uffdio_register uffdio_register; @@ -1462,7 +1499,7 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) /* Flush so it doesn't flush twice in parent/child later */ fflush(stdout); - uffd_test_ctx_init(0); + uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); if (test_pgsize > page_size) { /* This is a thp test */ @@ -1482,6 +1519,10 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) pagemap_fd = pagemap_open(); + /* Smoke test WP_UNPOPULATED first when it's still empty */ + if (test_pgsize == page_size) + userfaultfd_wp_unpopulated_test(pagemap_fd); + /* Touch the page */ *area_dst = 1; wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); @@ -1526,7 +1567,7 @@ static int userfaultfd_stress(void) struct uffdio_register uffdio_register; struct uffd_stats uffd_stats[nr_cpus]; - uffd_test_ctx_init(0); + uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); if (posix_memalign(&area, page_size, page_size)) err("out of memory"); -- cgit v1.2.3 From 23baf831a32c04f9a968812511540b1b3e648bf5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Mar 2023 14:31:33 +0300 Subject: mm, treewide: redefine MAX_ORDER sanely MAX_ORDER currently defined as number of orders page allocator supports: user can ask buddy allocator for page order between 0 and MAX_ORDER-1. This definition is counter-intuitive and lead to number of bugs all over the kernel. Change the definition of MAX_ORDER to be inclusive: the range of orders user can ask from buddy allocator is 0..MAX_ORDER now. [kirill@shutemov.name: fix min() warning] Link: https://lkml.kernel.org/r/20230315153800.32wib3n5rickolvh@box [akpm@linux-foundation.org: fix another min_t warning] [kirill@shutemov.name: fixups per Zi Yan] Link: https://lkml.kernel.org/r/20230316232144.b7ic4cif4kjiabws@box.shutemov.name [akpm@linux-foundation.org: fix underlining in docs] Link: https://lore.kernel.org/oe-kbuild-all/202303191025.VRCTk6mP-lkp@intel.com/ Link: https://lkml.kernel.org/r/20230315113133.11326-11-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Michael Ellerman [powerpc] Cc: "Kirill A. 
Shutemov" Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 6 ++-- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/arc/Kconfig | 4 +-- arch/arm/Kconfig | 9 ++---- arch/arm/configs/imx_v6_v7_defconfig | 2 +- arch/arm/configs/milbeaut_m10v_defconfig | 2 +- arch/arm/configs/oxnas_v6_defconfig | 2 +- arch/arm/configs/pxa_defconfig | 2 +- arch/arm/configs/sama7_defconfig | 2 +- arch/arm/configs/sp7021_defconfig | 2 +- arch/arm64/Kconfig | 27 ++++++++---------- arch/arm64/include/asm/sparsemem.h | 2 +- arch/arm64/kvm/hyp/include/nvhe/gfp.h | 2 +- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 10 +++---- arch/csky/Kconfig | 2 +- arch/ia64/Kconfig | 8 +++--- arch/ia64/include/asm/sparsemem.h | 4 +-- arch/ia64/mm/hugetlbpage.c | 2 +- arch/loongarch/Kconfig | 15 ++++------ arch/m68k/Kconfig.cpu | 5 +--- arch/mips/Kconfig | 19 ++++++------- arch/nios2/Kconfig | 7 ++--- arch/powerpc/Kconfig | 27 ++++++++---------- arch/powerpc/configs/85xx/ge_imp3a_defconfig | 2 +- arch/powerpc/configs/fsl-emb-nonhw.config | 2 +- arch/powerpc/mm/book3s64/iommu_api.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/sh/configs/ecovec24_defconfig | 2 +- arch/sh/mm/Kconfig | 17 +++++------ arch/sparc/Kconfig | 5 +--- arch/sparc/kernel/pci_sun4v.c | 2 +- arch/sparc/kernel/traps_64.c | 2 +- arch/sparc/mm/tsb.c | 4 +-- arch/um/kernel/um_arch.c | 4 +-- arch/xtensa/Kconfig | 5 +--- drivers/base/regmap/regmap-debugfs.c | 8 +++--- drivers/block/floppy.c | 2 +- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/crypto/hisilicon/sgl.c | 6 ++-- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/selftests/huge_pages.c | 2 +- drivers/gpu/drm/ttm/ttm_pool.c | 22 +++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 +- drivers/iommu/dma-iommu.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 4 +-- drivers/md/dm-bufio.c | 2 +- drivers/misc/genwqe/card_dev.c | 2 +- drivers/misc/genwqe/card_utils.c | 4 +-- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- drivers/net/ethernet/ibm/ibmvnic.h | 2 +- drivers/video/fbdev/hyperv_fb.c | 4 +-- drivers/video/fbdev/vermilion/vermilion.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- drivers/virtio/virtio_mem.c | 12 ++++---- fs/ramfs/file-nommu.c | 2 +- include/drm/ttm/ttm_pool.h | 2 +- include/linux/hugetlb.h | 2 +- include/linux/mmzone.h | 10 +++---- include/linux/pageblock-flags.h | 4 +-- include/linux/slab.h | 6 ++-- kernel/crash_core.c | 2 +- kernel/dma/pool.c | 6 ++-- kernel/events/ring_buffer.c | 6 ++-- mm/Kconfig | 10 +++---- mm/compaction.c | 8 +++--- mm/debug_vm_pgtable.c | 4 +-- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 +-- mm/kmsan/init.c | 6 ++-- mm/memblock.c | 2 +- mm/memory_hotplug.c | 4 +-- mm/page_alloc.c | 38 ++++++++++++------------- mm/page_isolation.c | 12 ++++---- mm/page_owner.c | 6 ++-- mm/page_reporting.c | 6 ++-- mm/shuffle.h | 2 +- mm/slab.c | 2 +- mm/slub.c | 6 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 14 ++++----- net/smc/smc_ib.c | 2 +- security/integrity/ima/ima_crypto.c | 2 +- tools/testing/memblock/linux/mmzone.h | 6 ++-- 84 files changed, 223 insertions(+), 253 deletions(-) (limited to 'tools/testing') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 86fd88492870..c18d94fa6470 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -172,7 +172,7 @@ variables. Offset of the free_list's member. 
This value is used to compute the number of free pages. -Each zone has a free_area structure array called free_area[MAX_ORDER]. +Each zone has a free_area structure array called free_area[MAX_ORDER + 1]. The free_list represents a linked list of free page blocks. (list_head, next|prev) @@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific information. Makedumpfile gets the start address of the vmalloc region from this. -(zone.free_area, MAX_ORDER) ---------------------------- +(zone.free_area, MAX_ORDER + 1) +------------------------------- Free areas descriptor. User-space tools use this value to iterate the free_area ranges. MAX_ORDER is used by the zone buddy allocator. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6221a1d057dd..50da4f26fad5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3969,7 +3969,7 @@ [KNL] Minimal page reporting order Format: Adjust the minimal page reporting order. The page - reporting is disabled when it exceeds (MAX_ORDER-1). + reporting is disabled when it exceeds MAX_ORDER. panic= [KNL] Kernel behaviour on panic: delay timeout > 0: seconds before rebooting diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index d9a13ccf89a3..ab6d701365bb 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -556,7 +556,7 @@ endmenu # "ARC Architecture Configuration" config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "12" if ARC_HUGEPAGE_16M - default "11" + default "11" if ARC_HUGEPAGE_16M + default "10" source "kernel/power/Kconfig" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e24a9820e12f..929e646e84b9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1355,9 +1355,9 @@ config ARM_MODULE_PLTS config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "12" if SOC_AM33XX - default "9" if SA1111 - default "11" + default "11" if SOC_AM33XX + default "8" if SA1111 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -1366,9 +1366,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. 
- config ALIGNMENT_TRAP def_bool CPU_CP15_MMU select HAVE_PROC_CPU if PROC_FS diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 6dc6fed12af8..345a67e67dbd 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -31,7 +31,7 @@ CONFIG_SOC_VF610=y CONFIG_SMP=y CONFIG_ARM_PSCI=y CONFIG_HIGHMEM=y -CONFIG_ARCH_FORCE_MAX_ORDER=14 +CONFIG_ARCH_FORCE_MAX_ORDER=13 CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" CONFIG_KEXEC=y CONFIG_CPU_FREQ=y diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig index bd29e5012cb0..385ad0f391a8 100644 --- a/arch/arm/configs/milbeaut_m10v_defconfig +++ b/arch/arm/configs/milbeaut_m10v_defconfig @@ -26,7 +26,7 @@ CONFIG_THUMB2_KERNEL=y # CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11 is not set # CONFIG_ARM_PATCH_IDIV is not set CONFIG_HIGHMEM=y -CONFIG_ARCH_FORCE_MAX_ORDER=12 +CONFIG_ARCH_FORCE_MAX_ORDER=11 CONFIG_SECCOMP=y CONFIG_KEXEC=y CONFIG_EFI=y diff --git a/arch/arm/configs/oxnas_v6_defconfig b/arch/arm/configs/oxnas_v6_defconfig index 70a67b3fc91b..90779812c6dd 100644 --- a/arch/arm/configs/oxnas_v6_defconfig +++ b/arch/arm/configs/oxnas_v6_defconfig @@ -12,7 +12,7 @@ CONFIG_ARCH_OXNAS=y CONFIG_MACH_OX820=y CONFIG_SMP=y CONFIG_NR_CPUS=16 -CONFIG_ARCH_FORCE_MAX_ORDER=12 +CONFIG_ARCH_FORCE_MAX_ORDER=11 CONFIG_SECCOMP=y CONFIG_ARM_APPENDED_DTB=y CONFIG_ARM_ATAG_DTB_COMPAT=y diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index e656d3af2266..b46e39369dbb 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -20,7 +20,7 @@ CONFIG_PXA_SHARPSL=y CONFIG_MACH_AKITA=y CONFIG_MACH_BORZOI=y CONFIG_AEABI=y -CONFIG_ARCH_FORCE_MAX_ORDER=9 +CONFIG_ARCH_FORCE_MAX_ORDER=8 CONFIG_CMDLINE="root=/dev/ram0 ro" CONFIG_KEXEC=y CONFIG_CPU_FREQ=y diff --git a/arch/arm/configs/sama7_defconfig b/arch/arm/configs/sama7_defconfig index 0d964c613d71..954112041403 100644 --- a/arch/arm/configs/sama7_defconfig +++ b/arch/arm/configs/sama7_defconfig @@ -19,7 +19,7 @@ CONFIG_ATMEL_CLOCKSOURCE_TCB=y # CONFIG_CACHE_L2X0 is not set # CONFIG_ARM_PATCH_IDIV is not set # CONFIG_CPU_SW_DOMAIN_PAN is not set -CONFIG_ARCH_FORCE_MAX_ORDER=15 +CONFIG_ARCH_FORCE_MAX_ORDER=14 CONFIG_UACCESS_WITH_MEMCPY=y # CONFIG_ATAGS is not set CONFIG_CMDLINE="console=ttyS0,115200 earlyprintk ignore_loglevel" diff --git a/arch/arm/configs/sp7021_defconfig b/arch/arm/configs/sp7021_defconfig index 5bca2eb59b86..c6448ac860b6 100644 --- a/arch/arm/configs/sp7021_defconfig +++ b/arch/arm/configs/sp7021_defconfig @@ -17,7 +17,7 @@ CONFIG_ARCH_SUNPLUS=y # CONFIG_VDSO is not set CONFIG_SMP=y CONFIG_THUMB2_KERNEL=y -CONFIG_ARCH_FORCE_MAX_ORDER=12 +CONFIG_ARCH_FORCE_MAX_ORDER=11 CONFIG_VFP=y CONFIG_NEON=y CONFIG_MODULES=y diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1023e896d46b..cb5c6aa3254e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1476,22 +1476,22 @@ config XEN # include/linux/mmzone.h requires the following to be true: # -# MAX_ORDER - 1 + PAGE_SHIFT <= SECTION_SIZE_BITS +# MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS # -# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS + 1 - PAGE_SHIFT: +# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT: # # | SECTION_SIZE_BITS | PAGE_SHIFT | max MAX_ORDER | default MAX_ORDER | # ----+-------------------+--------------+-----------------+--------------------+ -# 4K | 27 | 12 | 16 | 11 | -# 16K | 27 | 14 | 14 | 12 | -# 64K | 29 | 16 | 14 | 14 | +# 4K | 27 | 12 
| 15 | 10 | +# 16K | 27 | 14 | 13 | 11 | +# 64K | 29 | 16 | 13 | 13 | config ARCH_FORCE_MAX_ORDER int "Maximum zone order" if ARM64_4K_PAGES || ARM64_16K_PAGES - default "14" if ARM64_64K_PAGES - range 12 14 if ARM64_16K_PAGES - default "12" if ARM64_16K_PAGES - range 11 16 if ARM64_4K_PAGES - default "11" + default "13" if ARM64_64K_PAGES + range 11 13 if ARM64_16K_PAGES + default "11" if ARM64_16K_PAGES + range 10 15 if ARM64_4K_PAGES + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -1500,14 +1500,11 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - We make sure that we can allocate up to a HugePage size for each configuration. Hence we have : - MAX_ORDER = (PMD_SHIFT - PAGE_SHIFT) + 1 => PAGE_SHIFT - 2 + MAX_ORDER = PMD_SHIFT - PAGE_SHIFT => PAGE_SHIFT - 3 - However for 4K, we choose a higher default value, 11 as opposed to 10, giving us + However for 4K, we choose a higher default value, 10 as opposed to 9, giving us 4M allocations matching the default size used by generic code. config UNMAP_KERNEL_AT_EL0 diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h index 4b73463423c3..5f5437621029 100644 --- a/arch/arm64/include/asm/sparsemem.h +++ b/arch/arm64/include/asm/sparsemem.h @@ -10,7 +10,7 @@ /* * Section size must be at least 512MB for 64K base * page size config. Otherwise it will be less than - * (MAX_ORDER - 1) and the build process will fail. + * MAX_ORDER and the build process will fail. */ #ifdef CONFIG_ARM64_64K_PAGES #define SECTION_SIZE_BITS 29 diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 0a048dc06a7d..fe5472a184a3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -16,7 +16,7 @@ struct hyp_pool { * API at EL2. */ hyp_spinlock_t lock; - struct list_head free_area[MAX_ORDER]; + struct list_head free_area[MAX_ORDER + 1]; phys_addr_t range_start; phys_addr_t range_end; unsigned short max_order; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 803ba3222e75..b1e392186a0f 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -110,7 +110,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. 
*/ p->order = HYP_NO_ORDER; - for (; (order + 1) < pool->max_order; order++) { + for (; (order + 1) <= pool->max_order; order++) { buddy = __find_buddy_avail(pool, p, order); if (!buddy) break; @@ -203,9 +203,9 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) hyp_spin_lock(&pool->lock); /* Look for a high-enough-order page */ - while (i < pool->max_order && list_empty(&pool->free_area[i])) + while (i <= pool->max_order && list_empty(&pool->free_area[i])) i++; - if (i >= pool->max_order) { + if (i > pool->max_order) { hyp_spin_unlock(&pool->lock); return NULL; } @@ -228,8 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, int i; hyp_spin_lock_init(&pool->lock); - pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT)); - for (i = 0; i < pool->max_order; i++) + pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); + for (i = 0; i <= pool->max_order; i++) INIT_LIST_HEAD(&pool->free_area[i]); pool->range_start = phys; pool->range_end = phys + (nr_pages << PAGE_SHIFT); diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index dba02da6fa34..c694fac43bed 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -334,7 +334,7 @@ config HIGHMEM config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "11" + default "10" config DRAM_BASE hex "DRAM start addr (the same with memory-section in dts)" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index d7e4a24e8644..0d2f41fa56ee 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -202,10 +202,10 @@ config IA64_CYCLONE If you're unsure, answer N. config ARCH_FORCE_MAX_ORDER - int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE - range 11 17 if !HUGETLB_PAGE - default "17" if HUGETLB_PAGE - default "11" + int "MAX_ORDER (10 - 16)" if !HUGETLB_PAGE + range 10 16 if !HUGETLB_PAGE + default "16" if HUGETLB_PAGE + default "10" config SMP bool "Symmetric multi-processing support" diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h index 84e8ce387b69..a58f8b466d96 100644 --- a/arch/ia64/include/asm/sparsemem.h +++ b/arch/ia64/include/asm/sparsemem.h @@ -12,9 +12,9 @@ #define SECTION_SIZE_BITS (30) #define MAX_PHYSMEM_BITS (50) #ifdef CONFIG_ARCH_FORCE_MAX_ORDER -#if ((CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS) +#if (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT > SECTION_SIZE_BITS) #undef SECTION_SIZE_BITS -#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT) +#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT) #endif #endif diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 380d2f3966c9..e8dd4323fb86 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -170,7 +170,7 @@ static int __init hugetlb_setup_sz(char *str) size = memparse(str, &str); if (*str || !is_power_of_2(size) || !(tr_pages & size) || size <= PAGE_SIZE || - size >= (1UL << PAGE_SHIFT << MAX_ORDER)) { + size > (1UL << PAGE_SHIFT << MAX_ORDER)) { printk(KERN_WARNING "Invalid huge page size specified\n"); return 1; } diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 7fd51257e0ed..272a3a12c98d 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -420,12 +420,12 @@ config NODES_SHIFT config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 14 64 if PAGE_SIZE_64KB - default "14" if PAGE_SIZE_64KB - range 12 64 if PAGE_SIZE_16KB - default "12" if PAGE_SIZE_16KB - range 11 64 - default "11" + range 13 63 if PAGE_SIZE_64KB + default "13" if 
PAGE_SIZE_64KB + range 11 63 if PAGE_SIZE_16KB + default "11" if PAGE_SIZE_16KB + range 10 63 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -434,9 +434,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - The page size is not necessarily 4KB. Keep this in mind when choosing a value for this option. diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index 9380f6e3bb66..c9df6572133f 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -400,7 +400,7 @@ config SINGLE_MEMORY_CHUNK config ARCH_FORCE_MAX_ORDER int "Maximum zone order" if ADVANCED depends on !SINGLE_MEMORY_CHUNK - default "11" + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -413,9 +413,6 @@ config ARCH_FORCE_MAX_ORDER value also defines the minimal size of the hole that allows freeing unused memory map. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - config 060_WRITETHROUGH bool "Use write-through caching for 68060 supervisor accesses" depends on ADVANCED && M68060 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index e2f3ca73f40d..3e8b765b8c7b 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2137,14 +2137,14 @@ endchoice config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 14 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB - default "14" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB - range 13 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB - default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB - range 12 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB - default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB - range 0 64 - default "11" + range 13 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB + default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB + range 12 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB + default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB + range 11 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB + default "11" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB + range 0 63 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -2153,9 +2153,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - The page size is not necessarily 4KB. Keep this in mind when choosing a value for this option. diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index a582f72104f3..89708b95978c 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -46,8 +46,8 @@ source "kernel/Kconfig.hz" config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 9 20 - default "11" + range 8 19 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -56,9 +56,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. 
- This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - endmenu source "arch/nios2/platform/Kconfig.platform" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 49c6d36b2b3e..24d56536b269 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -897,18 +897,18 @@ config DATA_SHIFT config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 8 9 if PPC64 && PPC_64K_PAGES - default "9" if PPC64 && PPC_64K_PAGES - range 13 13 if PPC64 && !PPC_64K_PAGES - default "13" if PPC64 && !PPC_64K_PAGES - range 9 64 if PPC32 && PPC_16K_PAGES - default "9" if PPC32 && PPC_16K_PAGES - range 7 64 if PPC32 && PPC_64K_PAGES - default "7" if PPC32 && PPC_64K_PAGES - range 5 64 if PPC32 && PPC_256K_PAGES - default "5" if PPC32 && PPC_256K_PAGES - range 11 64 - default "11" + range 7 8 if PPC64 && PPC_64K_PAGES + default "8" if PPC64 && PPC_64K_PAGES + range 12 12 if PPC64 && !PPC_64K_PAGES + default "12" if PPC64 && !PPC_64K_PAGES + range 8 63 if PPC32 && PPC_16K_PAGES + default "8" if PPC32 && PPC_16K_PAGES + range 6 63 if PPC32 && PPC_64K_PAGES + default "6" if PPC32 && PPC_64K_PAGES + range 4 63 if PPC32 && PPC_256K_PAGES + default "4" if PPC32 && PPC_256K_PAGES + range 10 63 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -917,9 +917,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - The page size is not necessarily 4KB. For example, on 64-bit systems, 64KB pages can be enabled via CONFIG_PPC_64K_PAGES. Keep this in mind when choosing a value for this option. 
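A quick way to keep the Kconfig arithmetic in these hunks straight: every default and range drops by exactly one because the inclusive MAX_ORDER now names the largest allocatable order directly. The stand-alone sketch below (assuming 4K base pages and the new default of 10; it is not kernel code) prints the block sizes the buddy allocator can hand out under the new definition:

/* Stand-alone illustration of the inclusive MAX_ORDER semantics. */
#include <stdio.h>

#define PAGE_SHIFT	12	/* assumption: 4K base pages */
#define MAX_ORDER	10	/* new default; the old scheme called this 11 */

int main(void)
{
	unsigned int order;

	/* Valid orders now run 0..MAX_ORDER inclusive, hence "<=". */
	for (order = 0; order <= MAX_ORDER; order++)
		printf("order %2u -> %4lu KiB\n",
		       order, (1UL << (order + PAGE_SHIFT)) >> 10);

	/* Largest buddy block: 2^MAX_ORDER pages, i.e. 4 MiB with 4K pages. */
	printf("MAX_ORDER_NR_PAGES = %lu\n", 1UL << MAX_ORDER);
	return 0;
}

The defconfig updates that follow apply the same minus-one shift to boards that override the default.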
diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig b/arch/powerpc/configs/85xx/ge_imp3a_defconfig index ea719898b581..6cb7e90d52c1 100644 --- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig +++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig @@ -30,7 +30,7 @@ CONFIG_PREEMPT=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_BINFMT_MISC=m CONFIG_MATH_EMULATION=y -CONFIG_ARCH_FORCE_MAX_ORDER=17 +CONFIG_ARCH_FORCE_MAX_ORDER=16 CONFIG_PCI=y CONFIG_PCIEPORTBUS=y CONFIG_PCI_MSI=y diff --git a/arch/powerpc/configs/fsl-emb-nonhw.config b/arch/powerpc/configs/fsl-emb-nonhw.config index ab8a8c4530d9..3009b0efaf34 100644 --- a/arch/powerpc/configs/fsl-emb-nonhw.config +++ b/arch/powerpc/configs/fsl-emb-nonhw.config @@ -41,7 +41,7 @@ CONFIG_FIXED_PHY=y CONFIG_FONT_8x16=y CONFIG_FONT_8x8=y CONFIG_FONTS=y -CONFIG_ARCH_FORCE_MAX_ORDER=13 +CONFIG_ARCH_FORCE_MAX_ORDER=12 CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAME_WARN=1024 CONFIG_FTL=y diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 7fcfba162e0d..81d7185e2ae8 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } mmap_read_lock(mm); - chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + chunk = (1UL << (PAGE_SHIFT + MAX_ORDER)) / sizeof(struct vm_area_struct *); chunk = min(chunk, entries); for (entry = 0; entry < entries; entry += chunk) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f1ba8d1e8c1a..b900933507da 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -615,7 +615,7 @@ void __init gigantic_hugetlb_cma_reserve(void) order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; if (order) { - VM_WARN_ON(order < MAX_ORDER); + VM_WARN_ON(order <= MAX_ORDER); hugetlb_cma_reserve(order); } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4f6e20a35aa1..5a81f106068e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1740,7 +1740,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. */ - const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER); /* * We create the default window as big as we can. 
The constraint is diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig index b52e14ccb450..4d655e8d4d74 100644 --- a/arch/sh/configs/ecovec24_defconfig +++ b/arch/sh/configs/ecovec24_defconfig @@ -8,7 +8,7 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH7724=y -CONFIG_ARCH_FORCE_MAX_ORDER=12 +CONFIG_ARCH_FORCE_MAX_ORDER=11 CONFIG_MEMORY_SIZE=0x10000000 CONFIG_FLATMEM_MANUAL=y CONFIG_SH_ECOVEC=y diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 411fdc0901f7..40271090bd7d 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -20,13 +20,13 @@ config PAGE_OFFSET config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 9 64 if PAGE_SIZE_16KB - default "9" if PAGE_SIZE_16KB - range 7 64 if PAGE_SIZE_64KB - default "7" if PAGE_SIZE_64KB - range 11 64 - default "14" if !MMU - default "11" + range 8 63 if PAGE_SIZE_16KB + default "8" if PAGE_SIZE_16KB + range 6 63 if PAGE_SIZE_64KB + default "6" if PAGE_SIZE_64KB + range 10 63 + default "13" if !MMU + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -35,9 +35,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - The page size is not necessarily 4KB. Keep this in mind when choosing a value for this option. diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 84437a4c6545..e3242bf5a8df 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -271,7 +271,7 @@ config ARCH_SPARSEMEM_DEFAULT config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "13" + default "12" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -280,9 +280,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 13 means that the largest free memory block is 2^12 pages. - if SPARC64 || COMPILE_TEST source "kernel/power/Kconfig" endif diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 384480971805..7d91ca6aa675 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -193,7 +193,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, size = IO_PAGE_ALIGN(size); order = get_order(size); - if (unlikely(order >= MAX_ORDER)) + if (unlikely(order > MAX_ORDER)) return NULL; npages = size >> IO_PAGE_SHIFT; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 5b4de4a89dec..08ffd17d5ec3 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void) /* Now allocate error trap reporting scoreboard. 
*/ sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info)); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { if ((PAGE_SIZE << order) >= sz) break; } diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index dba8dffe2113..5e2931a18409 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -402,8 +402,8 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss) unsigned long new_rss_limit; gfp_t gfp_flags; - if (max_tsb_size > (PAGE_SIZE << (MAX_ORDER - 1))) - max_tsb_size = (PAGE_SIZE << (MAX_ORDER - 1)); + if (max_tsb_size > PAGE_SIZE << MAX_ORDER) + max_tsb_size = PAGE_SIZE << MAX_ORDER; new_cache_index = 0; for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) { diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 5e5a9c8e0e5d..8dcda617b8bf 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -368,10 +368,10 @@ int __init linux_main(int argc, char **argv) max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; /* - * Zones have to begin on a 1 << MAX_ORDER-1 page boundary, + * Zones have to begin on a 1 << MAX_ORDER page boundary, * so this makes sure that's true for highmem */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER - 1)) - 1); + max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1); if (physmem_size + iomem_size > max_physmem) { highmem = physmem_size + iomem_size - max_physmem; physmem_size -= highmem; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index bcb0c5d2abc2..3eee334ba873 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -773,7 +773,7 @@ config HIGHMEM config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "11" + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -782,9 +782,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. 
- endmenu menu "Power management options" diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index 817eda2075aa..c491fabe3617 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -226,8 +226,8 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from, if (*ppos < 0 || !count) return -EINVAL; - if (count > (PAGE_SIZE << (MAX_ORDER - 1))) - count = PAGE_SIZE << (MAX_ORDER - 1); + if (count > (PAGE_SIZE << MAX_ORDER)) + count = PAGE_SIZE << MAX_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) @@ -373,8 +373,8 @@ static ssize_t regmap_reg_ranges_read_file(struct file *file, if (*ppos < 0 || !count) return -EINVAL; - if (count > (PAGE_SIZE << (MAX_ORDER - 1))) - count = PAGE_SIZE << (MAX_ORDER - 1); + if (count > (PAGE_SIZE << MAX_ORDER)) + count = PAGE_SIZE << MAX_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 90d2dfb6448e..cec2c20f5e59 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3079,7 +3079,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr) } } -#define MAX_LEN (1UL << (MAX_ORDER - 1) << PAGE_SHIFT) +#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT) static int raw_cmd_copyin(int cmd, void __user *param, struct floppy_raw_cmd **rcmd) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index e2f25926eb51..bf095baca244 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -886,7 +886,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp) /* * The length of the ID shouldn't be assumed by software since * it may change in the future. The allocation size is limited - * to 1 << (PAGE_SHIFT + MAX_ORDER - 1) by the page allocator. + * to 1 << (PAGE_SHIFT + MAX_ORDER) by the page allocator. * If the allocation fails, simply return ENOMEM rather than * warning in the kernel log. */ diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c index 09586a837b1e..3df7a256e919 100644 --- a/drivers/crypto/hisilicon/sgl.c +++ b/drivers/crypto/hisilicon/sgl.c @@ -70,11 +70,11 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev, HISI_ACC_SGL_ALIGN_SIZE); /* - * the pool may allocate a block of memory of size PAGE_SIZE * 2^(MAX_ORDER - 1), + * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_ORDER, * block size may exceed 2^31 on ia64, so the max of block size is 2^31 */ - block_size = 1 << (PAGE_SHIFT + MAX_ORDER <= 32 ? - PAGE_SHIFT + MAX_ORDER - 1 : 31); + block_size = 1 << (PAGE_SHIFT + MAX_ORDER < 32 ? 
+ PAGE_SHIFT + MAX_ORDER : 31); sgl_num_per_block = block_size / sgl_size; block_num = count / sgl_num_per_block; remain_sgl = count % sgl_num_per_block; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index eae9e9f6d3bf..6bc26b4b06b8 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -36,7 +36,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj) struct sg_table *st; struct scatterlist *sg; unsigned int npages; /* restricted by sg_alloc_table */ - int max_order = MAX_ORDER - 1; + int max_order = MAX_ORDER; unsigned int max_segment; gfp_t gfp; diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c index defece0bcb81..99f39a5feca1 100644 --- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c @@ -115,7 +115,7 @@ static int get_huge_pages(struct drm_i915_gem_object *obj) do { struct page *page; - GEM_BUG_ON(order >= MAX_ORDER); + GEM_BUG_ON(order > MAX_ORDER); page = alloc_pages(GFP | __GFP_ZERO, order); if (!page) goto err; diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index aa116a7bbae3..6c8585abe08d 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644); static atomic_long_t allocated_pages; -static struct ttm_pool_type global_write_combined[MAX_ORDER]; -static struct ttm_pool_type global_uncached[MAX_ORDER]; +static struct ttm_pool_type global_write_combined[MAX_ORDER + 1]; +static struct ttm_pool_type global_uncached[MAX_ORDER + 1]; -static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER]; -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER]; +static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1]; +static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1]; static spinlock_t shrinker_lock; static struct list_head shrinker_list; @@ -405,7 +405,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt, else gfp_flags |= GFP_HIGHUSER; - for (order = min_t(unsigned int, MAX_ORDER - 1, __fls(num_pages)); + for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages)); num_pages; order = min_t(unsigned int, order, __fls(num_pages))) { struct ttm_pool_type *pt; @@ -542,7 +542,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, if (use_dma_alloc) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j < MAX_ORDER; ++j) + for (j = 0; j <= MAX_ORDER; ++j) ttm_pool_type_init(&pool->caching[i].orders[j], pool, i, j); } @@ -562,7 +562,7 @@ void ttm_pool_fini(struct ttm_pool *pool) if (pool->use_dma_alloc) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j < MAX_ORDER; ++j) + for (j = 0; j <= MAX_ORDER; ++j) ttm_pool_type_fini(&pool->caching[i].orders[j]); } @@ -616,7 +616,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m) unsigned int i; seq_puts(m, "\t "); - for (i = 0; i < MAX_ORDER; ++i) + for (i = 0; i <= MAX_ORDER; ++i) seq_printf(m, " ---%2u---", i); seq_puts(m, "\n"); } @@ -627,7 +627,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt, { unsigned int i; - for (i = 0; i < MAX_ORDER; ++i) + for (i = 0; i <= MAX_ORDER; ++i) seq_printf(m, " %8u", ttm_pool_type_count(&pt[i])); seq_puts(m, "\n"); } @@ -736,7 +736,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) spin_lock_init(&shrinker_lock); 
INIT_LIST_HEAD(&shrinker_list); - for (i = 0; i < MAX_ORDER; ++i) { + for (i = 0; i <= MAX_ORDER; ++i) { ttm_pool_type_init(&global_write_combined[i], NULL, ttm_write_combined, i); ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i); @@ -769,7 +769,7 @@ void ttm_pool_mgr_fini(void) { unsigned int i; - for (i = 0; i < MAX_ORDER; ++i) { + for (i = 0; i <= MAX_ORDER; ++i) { ttm_pool_type_fini(&global_write_combined[i]); ttm_pool_type_fini(&global_uncached[i]); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 8d772ea8a583..b574c58a3487 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -182,7 +182,7 @@ #ifdef CONFIG_CMA_ALIGNMENT #define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT) #else -#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER - 1) +#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER) #endif /* diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index ac996fd6bd9c..7a9f0b0bddbd 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -736,7 +736,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, struct page **pages; unsigned int i = 0, nid = dev_to_node(dev); - order_mask &= GENMASK(MAX_ORDER - 1, 0); + order_mask &= GENMASK(MAX_ORDER, 0); if (!order_mask) return NULL; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 586271b8aa39..85790b870877 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2440,8 +2440,8 @@ static bool its_parse_indirect_baser(struct its_node *its, * feature is not supported by hardware. */ new_order = max_t(u32, get_order(esz << ids), new_order); - if (new_order >= MAX_ORDER) { - new_order = MAX_ORDER - 1; + if (new_order > MAX_ORDER) { + new_order = MAX_ORDER; ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz); pr_warn("ITS@%pa: %s Table too large, reduce ids %llu->%u\n", &its->phys_base, its_base_type_string[type], diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cf077f9b30c3..733053c2eaa0 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -408,7 +408,7 @@ static void __cache_size_refresh(void) * If the allocation may fail we use __get_free_pages. Memory fragmentation * won't have a fatal effect here, but it just causes flushes of some other * buffers and more I/O will be performed. Don't use __get_free_pages if it - * always fails (i.e. order >= MAX_ORDER). + * always fails (i.e. order > MAX_ORDER). * * If the allocation shouldn't fail we use __vmalloc. 
This is only for the * initial reserve allocation, so there's no risk of wasting all vmalloc diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index d0e27438a73c..55fc5b80e649 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -443,7 +443,7 @@ static int genwqe_mmap(struct file *filp, struct vm_area_struct *vma) if (vsize == 0) return -EINVAL; - if (get_order(vsize) >= MAX_ORDER) + if (get_order(vsize) > MAX_ORDER) return -ENOMEM; dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL); diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index ac29698d085a..1c798d6b2dfb 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -210,7 +210,7 @@ u32 genwqe_crc32(u8 *buff, size_t len, u32 init) void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size, dma_addr_t *dma_handle) { - if (get_order(size) >= MAX_ORDER) + if (get_order(size) > MAX_ORDER) return NULL; return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle, @@ -308,7 +308,7 @@ int genwqe_alloc_sync_sgl(struct genwqe_dev *cd, struct genwqe_sgl *sgl, sgl->write = write; sgl->sgl_size = genwqe_sgl_size(sgl->nr_pages); - if (get_order(sgl->sgl_size) >= MAX_ORDER) { + if (get_order(sgl->sgl_size) > MAX_ORDER) { dev_err(&pci_dev->dev, "[%s] err: too much memory requested!\n", __func__); return ret; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 25be7f8ac7cd..3973ca6adf4c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1041,7 +1041,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) return; order = get_order(alloc_size); - if (order >= MAX_ORDER) { + if (order > MAX_ORDER) { if (net_ratelimit()) dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); return; diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index b35c9b6f913b..4e18b4cefa97 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -75,7 +75,7 @@ * pool for the 4MB. Thus the 16 Rx and Tx queues require 32 * 5 = 160 * plus 16 for the TSO pools for a total of 176 LTB mappings per VNIC. 
*/ -#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << (MAX_ORDER - 1)) * PAGE_SIZE)) +#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << MAX_ORDER) * PAGE_SIZE)) #define IBMVNIC_ONE_LTB_SIZE min((u32)(8 << 20), IBMVNIC_ONE_LTB_MAX) #define IBMVNIC_LTB_SET_SIZE (38 << 20) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index ec3f6cf05f8c..34781dec3856 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -946,7 +946,7 @@ static phys_addr_t hvfb_get_phymem(struct hv_device *hdev, if (request_size == 0) return -1; - if (order < MAX_ORDER) { + if (order <= MAX_ORDER) { /* Call alloc_pages if the size is less than 2^MAX_ORDER */ page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); if (!page) @@ -977,7 +977,7 @@ static void hvfb_release_phymem(struct hv_device *hdev, { unsigned int order = get_order(size); - if (order < MAX_ORDER) + if (order <= MAX_ORDER) __free_pages(pfn_to_page(paddr >> PAGE_SHIFT), order); else dma_free_coherent(&hdev->device, diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c index 0374ee6b6d03..32e74e02a02f 100644 --- a/drivers/video/fbdev/vermilion/vermilion.c +++ b/drivers/video/fbdev/vermilion/vermilion.c @@ -197,7 +197,7 @@ static int vmlfb_alloc_vram(struct vml_info *vinfo, va = &vinfo->vram[i]; order = 0; - while (requested > (PAGE_SIZE << order) && order < MAX_ORDER) + while (requested > (PAGE_SIZE << order) && order <= MAX_ORDER) order++; err = vmlfb_alloc_vram_area(va, order, 0); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 3f78a3a1eb75..5b15936a5214 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -33,7 +33,7 @@ #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ __GFP_NOMEMALLOC) /* The order of free page blocks to report to host */ -#define VIRTIO_BALLOON_HINT_BLOCK_ORDER (MAX_ORDER - 1) +#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER /* The size of a free page block in bytes */ #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 0c2892ec6817..835f6cc2fb66 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1120,13 +1120,13 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn, */ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) { - unsigned long order = MAX_ORDER - 1; + unsigned long order = MAX_ORDER; unsigned long i; /* * We might get called for ranges that don't cover properly aligned - * MAX_ORDER - 1 pages; however, we can only online properly aligned - * pages with an order of MAX_ORDER - 1 at maximum. + * MAX_ORDER pages; however, we can only online properly aligned + * pages with an order of MAX_ORDER at maximum. */ while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) order--; @@ -1237,9 +1237,9 @@ static void virtio_mem_online_page(struct virtio_mem *vm, bool do_online; /* - * We can get called with any order up to MAX_ORDER - 1. If our - * subblock size is smaller than that and we have a mixture of plugged - * and unplugged subblocks within such a page, we have to process in + * We can get called with any order up to MAX_ORDER. If our subblock + * size is smaller than that and we have a mixture of plugged and + * unplugged subblocks within such a page, we have to process in * smaller granularity. In that case we'll adjust the order exactly once * within the loop. 
*/ diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2f67516bb9bf..9fbb9b5256f7 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); - if (unlikely(order >= MAX_ORDER)) + if (unlikely(order > MAX_ORDER)) return -EFBIG; ret = inode_newsize_ok(inode, newsize); diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h index ef09b23d29e3..8ce14f9d202a 100644 --- a/include/drm/ttm/ttm_pool.h +++ b/include/drm/ttm/ttm_pool.h @@ -72,7 +72,7 @@ struct ttm_pool { bool use_dma32; struct { - struct ttm_pool_type orders[MAX_ORDER]; + struct ttm_pool_type orders[MAX_ORDER + 1]; } caching[TTM_NUM_CACHING_TYPES]; }; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7c977d234aba..8fb7d91cd0b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -818,7 +818,7 @@ static inline unsigned huge_page_shift(struct hstate *h) static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) >= MAX_ORDER; + return huge_page_order(h) > MAX_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bf8786d45b31..35b11cc210d9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -26,11 +26,11 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER -#define MAX_ORDER 11 +#define MAX_ORDER 10 #else #define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif -#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) +#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed @@ -93,7 +93,7 @@ static inline bool migratetype_is_mergeable(int mt) } #define for_each_migratetype_order(order, type) \ - for (order = 0; order < MAX_ORDER; order++) \ + for (order = 0; order <= MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; @@ -922,7 +922,7 @@ struct zone { CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ - struct free_area free_area[MAX_ORDER]; + struct free_area free_area[MAX_ORDER + 1]; /* zone flags, see below */ unsigned long flags; @@ -1745,7 +1745,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) -#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 5f1ae07d724b..e83c4c095041 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,14 +41,14 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. 
*/ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1) +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order (MAX_ORDER-1) +#define pageblock_order MAX_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 45af70315a94..aa4575ef2965 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -284,7 +284,7 @@ static inline unsigned int arch_slab_minalign(void) * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif @@ -292,7 +292,7 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLUB #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif @@ -305,7 +305,7 @@ static inline unsigned int arch_slab_minalign(void) * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 755f5f08ab38..90ce1dfd591c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -474,7 +474,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vmap_area, va_start); VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 4d40dcce7604..1acec2e22827 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, void *addr; int ret = -ENOMEM; - /* Cannot allocate larger than MAX_ORDER-1 */ - order = min(get_order(pool_size), MAX_ORDER-1); + /* Cannot allocate larger than MAX_ORDER */ + order = min(get_order(pool_size), MAX_ORDER); do { pool_size = 1 << (PAGE_SHIFT + order); @@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void) /* * If coherent_pool was not used on the command line, default the pool - * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER. 
*/ if (!atomic_pool_size) { unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index d6bbdb7830b2..a0433f37b024 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -609,8 +609,8 @@ static struct page *rb_alloc_aux_page(int node, int order) { struct page *page; - if (order >= MAX_ORDER) - order = MAX_ORDER - 1; + if (order > MAX_ORDER) + order = MAX_ORDER; do { page = alloc_pages_node(node, PERF_AUX_GFP, order); @@ -814,7 +814,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) + if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER) goto fail; node = (cpu == -1) ? cpu : cpu_to_node(cpu); diff --git a/mm/Kconfig b/mm/Kconfig index ca98b2072df5..969286ab14a1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -346,9 +346,9 @@ config SHUFFLE_PAGE_ALLOCATOR the presence of a memory-side-cache. There are also incidental security benefits as it reduces the predictability of page allocations to compliment SLAB_FREELIST_RANDOM, but the - default granularity of shuffling on the "MAX_ORDER - 1" i.e, - 10th order of pages is selected based on cache utilization - benefits on x86. + default granularity of shuffling on the MAX_ORDER i.e, 10th + order of pages is selected based on cache utilization benefits + on x86. While the randomization improves cache utilization it may negatively impact workloads on platforms without a cache. For @@ -666,8 +666,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available on a platform. - Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be - clamped down to MAX_ORDER - 1. + Note that the pageblock_order cannot exceed MAX_ORDER and will be + clamped down to MAX_ORDER. config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA diff --git a/mm/compaction.c b/mm/compaction.c index 5a9501e0ae01..709136556b9e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -583,7 +583,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (likely(order < MAX_ORDER)) { + if (likely(order <= MAX_ORDER)) { blockpfn += (1UL << order) - 1; cursor += (1UL << order) - 1; } @@ -938,7 +938,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * a valid page order. Consider only values in the * valid order range to prevent low_pfn overflow. */ - if (freepage_order > 0 && freepage_order < MAX_ORDER) + if (freepage_order > 0 && freepage_order <= MAX_ORDER) low_pfn += (1UL << freepage_order) - 1; continue; } @@ -954,7 +954,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); - if (likely(order < MAX_ORDER)) + if (likely(order <= MAX_ORDER)) low_pfn += (1UL << order) - 1; goto isolate_fail; } @@ -2124,7 +2124,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) /* Direct compactor: Is a suitable page free? 
*/ ret = COMPACT_NO_SUITABLE_PAGE; - for (order = cc->order; order < MAX_ORDER; order++) { + for (order = cc->order; order <= MAX_ORDER; order++) { struct free_area *area = &cc->zone->free_area[order]; bool can_steal; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 4362021b1ce7..c54177aabebd 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1086,7 +1086,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) struct page *page = NULL; #ifdef CONFIG_CONTIG_ALLOC - if (order >= MAX_ORDER) { + if (order > MAX_ORDER) { page = alloc_contig_pages((1 << order), GFP_KERNEL, first_online_node, NULL); if (page) { @@ -1096,7 +1096,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) } #endif - if (order < MAX_ORDER) + if (order <= MAX_ORDER) page = alloc_pages(GFP_KERNEL, order); return page; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2d860e70fe88..1df386d13d24 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -467,7 +467,7 @@ static int __init hugepage_init(void) /* * hugepages can't be allocated by the buddy allocator */ - MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER); + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER); /* * we use page->mapping and page->index in second tail page * as list_head: assuming THP order >= 2 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 712e32b38295..9122e50ae02a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2090,7 +2090,7 @@ pgoff_t hugetlb_basepage_index(struct page *page) pgoff_t index = page_index(page_head); unsigned long compound_idx; - if (compound_order(page_head) >= MAX_ORDER) + if (compound_order(page_head) > MAX_ORDER) compound_idx = page_to_pfn(page) - page_to_pfn(page_head); else compound_idx = page - page_head; @@ -4497,7 +4497,7 @@ static int __init default_hugepagesz_setup(char *s) * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge - * page size is gigantic (>= MAX_ORDER), then the pages must be + * page size is gigantic (> MAX_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index 7fb794242fad..ffedf4dbc49d 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void) struct metadata_page_pair { struct page *shadow, *origin; }; -static struct metadata_page_pair held_back[MAX_ORDER] __initdata; +static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata; /* * Eager metadata allocation. When the memblock allocator is freeing pages to @@ -211,8 +211,8 @@ static void kmsan_memblock_discard(void) * order=N-1, * - repeat. 
*/ - collect.order = MAX_ORDER - 1; - for (int i = MAX_ORDER - 1; i >= 0; i--) { + collect.order = MAX_ORDER; + for (int i = MAX_ORDER; i >= 0; i--) { if (held_back[i].shadow) smallstack_push(&collect, held_back[i].shadow); if (held_back[i].origin) diff --git a/mm/memblock.c b/mm/memblock.c index 25fd0626a9e7..7911224b1ed3 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2043,7 +2043,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) int order; while (start < end) { - order = min(MAX_ORDER - 1UL, __ffs(start)); + order = min_t(int, MAX_ORDER, __ffs(start)); while (start + (1UL << order) > end) order--; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index db3b270254f1..c8f0a8c2d049 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -596,7 +596,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) unsigned long pfn; /* - * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might + * Online the pages in MAX_ORDER aligned chunks. The callback might * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). @@ -605,7 +605,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * this and the first chunk to online will be pageblock_nr_pages. */ for (pfn = start_pfn; pfn < end_pfn;) { - int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + int order = min_t(int, MAX_ORDER, __ffs(pfn)); (*online_page_callback)(pfn_to_page(pfn), order); pfn += (1UL << order); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0936bde1d486..c3e49d028a7a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1063,7 +1063,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, unsigned long higher_page_pfn; struct page *higher_page; - if (order >= MAX_ORDER - 2) + if (order >= MAX_ORDER - 1) return false; higher_page_pfn = buddy_pfn & pfn; @@ -1118,7 +1118,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER - 1) { + while (order < MAX_ORDER) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -2499,7 +2499,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, struct page *page; /* Find a page of the appropriate size in the preferred list */ - for (current_order = order; current_order < MAX_ORDER; ++current_order) { + for (current_order = order; current_order <= MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); page = get_page_from_free_area(area, migratetype); if (!page) @@ -2871,7 +2871,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, continue; spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { struct free_area *area = &(zone->free_area[order]); page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); @@ -2955,7 +2955,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, * approximates finding the pageblock with the most free pages, which * would be too costly to do exactly. 
*/ - for (current_order = MAX_ORDER - 1; current_order >= min_order; + for (current_order = MAX_ORDER; current_order >= min_order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2981,7 +2981,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, return false; find_smallest: - for (current_order = order; current_order < MAX_ORDER; + for (current_order = order; current_order <= MAX_ORDER; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2994,7 +2994,7 @@ find_smallest: * This should not happen - we already found a suitable fallback * when looking for the largest page. */ - VM_BUG_ON(current_order == MAX_ORDER); + VM_BUG_ON(current_order > MAX_ORDER); do_steal: page = get_page_from_free_area(area, fallback_mt); @@ -3955,7 +3955,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; /* For a high-order request, check at least one suitable page is free */ - for (o = order; o < MAX_ORDER; o++) { + for (o = order; o <= MAX_ORDER; o++) { struct free_area *area = &z->free_area[o]; int mt; @@ -5475,7 +5475,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, * There are several places where we assume that the order value is sane * so bail out early if the request is out of bound. */ - if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp)) + if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)) return NULL; gfp &= gfp_allowed_mask; @@ -6205,8 +6205,8 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i for_each_populated_zone(zone) { unsigned int order; - unsigned long nr[MAX_ORDER], flags, total = 0; - unsigned char types[MAX_ORDER]; + unsigned long nr[MAX_ORDER + 1], flags, total = 0; + unsigned char types[MAX_ORDER + 1]; if (zone_idx(zone) > max_zone_idx) continue; @@ -6216,7 +6216,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; int type; @@ -6230,7 +6230,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i } } spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { printk(KERN_CONT "%lu*%lukB ", nr[order], K(1UL) << order); if (nr[order]) @@ -7581,7 +7581,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_ORDER - 1; + unsigned int order = MAX_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -9076,7 +9076,7 @@ void *__init alloc_large_system_hash(const char *tablename, else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); - } else if (get_order(size) >= MAX_ORDER || hashdist) { + } else if (get_order(size) > MAX_ORDER || hashdist) { table = vmalloc_huge(size, gfp_flags); virt = true; if (table) @@ -9290,7 +9290,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, order = 0; outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { - if (++order >= MAX_ORDER) { + if (++order > MAX_ORDER) { outer_start = start; break; } @@ -9540,7 +9540,7 @@ bool 
is_free_buddy_page(struct page *page) unsigned long pfn = page_to_pfn(page); unsigned int order; - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); if (PageBuddy(page_head) && @@ -9548,7 +9548,7 @@ bool is_free_buddy_page(struct page *page) break; } - return order < MAX_ORDER; + return order <= MAX_ORDER; } EXPORT_SYMBOL(is_free_buddy_page); @@ -9599,7 +9599,7 @@ bool take_page_off_buddy(struct page *page) bool ret = false; spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); int page_order = buddy_order(page_head); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 47fbc1696466..c6f3605e37ab 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -226,7 +226,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) */ if (PageBuddy(page)) { order = buddy_order(page); - if (order >= pageblock_order && order < MAX_ORDER - 1) { + if (order >= pageblock_order && order < MAX_ORDER) { buddy = find_buddy_page_pfn(page, page_to_pfn(page), order, NULL); if (buddy && !is_migrate_isolate_page(buddy)) { @@ -290,11 +290,11 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * isolate_single_pageblock() * @migratetype: migrate type to set in error recovery. * - * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one + * Free and in-use pages can be as big as MAX_ORDER and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same * time, free page accounting can go wrong. For example, in the case of - * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks. - * [ MAX_ORDER-1 ] + * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pagelbocks. + * [ MAX_ORDER ] * [ pageblock0 | pageblock1 ] * When either pageblock is isolated, if it is a free page, the page is not * split into separate migratetype lists, which is supposed to; if it is an @@ -451,7 +451,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, * the free page to the right migratetype list. * * head_pfn is not used here as a hugetlb page order - * can be bigger than MAX_ORDER-1, but after it is + * can be bigger than MAX_ORDER, but after it is * freed, the free page order is not. Use pfn within * the range to find the head of the free page. 
*/ @@ -459,7 +459,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, outer_pfn = pfn; while (!PageBuddy(pfn_to_page(outer_pfn))) { /* stop if we cannot find the free page */ - if (++order >= MAX_ORDER) + if (++order > MAX_ORDER) goto failed; outer_pfn &= ~0UL << order; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 220cdeddc295..31169b3e7f06 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -315,7 +315,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, unsigned long freepage_order; freepage_order = buddy_order_unsafe(page); - if (freepage_order < MAX_ORDER) + if (freepage_order <= MAX_ORDER) pfn += (1UL << freepage_order) - 1; continue; } @@ -549,7 +549,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); - if (freepage_order < MAX_ORDER) + if (freepage_order <= MAX_ORDER) pfn += (1UL << freepage_order) - 1; continue; } @@ -657,7 +657,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageBuddy(page)) { unsigned long order = buddy_order_unsafe(page); - if (order > 0 && order < MAX_ORDER) + if (order > 0 && order <= MAX_ORDER) pfn += (1UL << order) - 1; continue; } diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 275b466de37b..b021f482a4cb 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -20,7 +20,7 @@ static int page_order_update_notify(const char *val, const struct kernel_param * * If param is set beyond this limit, order is set to default * pageblock_order value */ - return param_set_uint_minmax(val, kp, 0, MAX_ORDER-1); + return param_set_uint_minmax(val, kp, 0, MAX_ORDER); } static const struct kernel_param_ops page_reporting_param_ops = { @@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev, return err; /* Process each free list starting from lowest order/mt */ - for (order = page_reporting_order; order < MAX_ORDER; order++) { + for (order = page_reporting_order; order <= MAX_ORDER; order++) { for (mt = 0; mt < MIGRATE_TYPES; mt++) { /* We do not pull pages from the isolate free list */ if (is_migrate_isolate(mt)) @@ -370,7 +370,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) */ if (page_reporting_order == -1) { - if (prdev->order > 0 && prdev->order < MAX_ORDER) + if (prdev->order > 0 && prdev->order <= MAX_ORDER) page_reporting_order = prdev->order; else page_reporting_order = pageblock_order; diff --git a/mm/shuffle.h b/mm/shuffle.h index cec62984f7d3..a6bdf54f96f1 100644 --- a/mm/shuffle.h +++ b/mm/shuffle.h @@ -4,7 +4,7 @@ #define _MM_SHUFFLE_H #include -#define SHUFFLE_ORDER (MAX_ORDER-1) +#define SHUFFLE_ORDER MAX_ORDER #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); diff --git a/mm/slab.c b/mm/slab.c index edbe722fb906..6b7c172158e5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str) { get_option(&str, &slab_max_order); slab_max_order = slab_max_order < 0 ? 0 : - min(slab_max_order, MAX_ORDER - 1); + min(slab_max_order, MAX_ORDER); slab_max_order_set = true; return 1; diff --git a/mm/slub.c b/mm/slub.c index 32eb6b50fe18..f49d669ff604 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4171,8 +4171,8 @@ static inline int calculate_order(unsigned int size) /* * Doh this slab cannot be placed using slub_max_order. 
*/ - order = calc_slab_order(size, 1, MAX_ORDER - 1, 1); - if (order < MAX_ORDER) + order = calc_slab_order(size, 1, MAX_ORDER, 1); + if (order <= MAX_ORDER) return order; return -ENOSYS; } @@ -4697,7 +4697,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, (int *)&slub_max_order); - slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1); + slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER); return 1; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8faac4310cb5..98719e72b5e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7002,7 +7002,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. */ - BUILD_BUG_ON(MAX_ORDER > S8_MAX); + BUILD_BUG_ON(MAX_ORDER >= S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); diff --git a/mm/vmstat.c b/mm/vmstat.c index 1ea6a5ce1c41..b7307627772d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1055,7 +1055,7 @@ static void fill_contig_page_info(struct zone *zone, info->free_blocks_total = 0; info->free_blocks_suitable = 0; - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { unsigned long blocks; /* @@ -1088,7 +1088,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in { unsigned long requested = 1UL << order; - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (WARN_ON_ONCE(order > MAX_ORDER)) return 0; if (!info->free_blocks_total) @@ -1462,7 +1462,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, int order; seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) + for (order = 0; order <= MAX_ORDER; ++order) /* * Access to nr_free is lockless as nr_free is used only for * printing purposes. Use data_race to avoid KCSAN warning. 
@@ -1491,7 +1491,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, pgdat->node_id, zone->name, migratetype_names[mtype]); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order <= MAX_ORDER; ++order) { unsigned long freecount = 0; struct free_area *area; struct list_head *curr; @@ -1531,7 +1531,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg) /* Print header */ seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); - for (order = 0; order < MAX_ORDER; ++order) + for (order = 0; order <= MAX_ORDER; ++order) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); @@ -2153,7 +2153,7 @@ static void unusable_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order <= MAX_ORDER; ++order) { fill_contig_page_info(zone, order, &info); index = unusable_free_index(order, &info); seq_printf(m, "%d.%03d ", index / 1000, index % 1000); @@ -2205,7 +2205,7 @@ static void extfrag_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order <= MAX_ORDER; ++order) { fill_contig_page_info(zone, order, &info); index = __fragmentation_index(order, &info); seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 854772dd52fd..9b66d6aeeb1a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -843,7 +843,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) goto out; /* the calculated number of cq entries fits to mlx5 cq allocation */ cqe_size_order = cache_line_size() == 128 ? 7 : 6; - smc_order = MAX_ORDER - cqe_size_order - 1; + smc_order = MAX_ORDER - cqe_size_order; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 64499056648a..51ad29940f05 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -38,7 +38,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp) size = memparse(val, NULL); order = get_order(size); - if (order >= MAX_ORDER) + if (order > MAX_ORDER) return -EINVAL; ima_maxorder = order; ima_bufsize = PAGE_SIZE << order; diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index e65f89b12f1c..134f8eab0768 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -17,10 +17,10 @@ enum zone_type { }; #define MAX_NR_ZONES __MAX_NR_ZONES -#define MAX_ORDER 11 -#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) +#define MAX_ORDER 10 +#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) -#define pageblock_order (MAX_ORDER - 1) +#define pageblock_order MAX_ORDER #define pageblock_nr_pages BIT(pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) -- cgit v1.2.3 From 0289184476c845968ad6ac9083c96cc0f75ca505 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:50 -0700 Subject: mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new PTE to resolve a missing fault, one can install a write-protected one. 
This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in combination. This was motivated by testing HugeTLB HGM [1], and in particular its interaction with userfaultfd features. Existing userfaultfd code supports using WP and MINOR modes together (i.e. you can register an area with both enabled), but without this CONTINUE flag the combination is in practice unusable. So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing as UFFDIO_COPY_MODE_WP, but for *minor* faults. Update the selftest to do some very basic exercising of the new flag. Update Documentation/ to describe how these flags are used (neither the COPY nor the new CONTINUE versions of this mode flag were described there before). [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/ Link: https://lkml.kernel.org/r/20230314221250.682452-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 8 ++++++++ fs/userfaultfd.c | 8 ++++++-- include/linux/userfaultfd_k.h | 3 ++- include/uapi/linux/userfaultfd.h | 7 +++++++ mm/userfaultfd.c | 5 +++-- tools/testing/selftests/mm/userfaultfd.c | 4 ++++ 6 files changed, 30 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index bd2226299583..7c304e432205 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -236,6 +236,14 @@ newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED and set the feature bit in advance to make sure none ptes will also be write protected even upon anonymous memory. +When using ``UFFDIO_REGISTER_MODE_WP`` in combination with either +``UFFDIO_REGISTER_MODE_MISSING`` or ``UFFDIO_REGISTER_MODE_MINOR``, when +resolving missing / minor faults with ``UFFDIO_COPY`` or ``UFFDIO_CONTINUE`` +respectively, it may be desirable for the new page / mapping to be +write-protected (so future writes will also result in a WP fault). These ioctls +support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP`` +respectively) to configure the mapping this way. 
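A minimal userspace sketch of what the new flag looks like in practice, modelled on the selftest's continue_range() change further below (the helper name here is made up for illustration, and it assumes uffd was registered over the range with UFFDIO_REGISTER_MODE_MINOR | UFFDIO_REGISTER_MODE_WP and that the kernel/headers carry this patch, i.e. advertise UFFDIO_CONTINUE_MODE_WP):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	/* Resolve a minor fault but leave the new mapping write-protected. */
	static int continue_wp(int uffd, uint64_t start, uint64_t len)
	{
		struct uffdio_continue req = {
			.range = { .start = start, .len = len },
			.mode  = UFFDIO_CONTINUE_MODE_WP,	/* map write-protected */
		};

		return ioctl(uffd, UFFDIO_CONTINUE, &req);
	}

A later write to the range then raises a WP fault, which the monitor can clear with UFFDIO_WRITEPROTECT once it has done its bookkeeping.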
+ QEMU/KVM ======== diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8971c3613cc6..8395605790f6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1893,6 +1893,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; @@ -1917,13 +1918,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) uffdio_continue.range.start) { goto out; } - if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, uffdio_continue.range.len, - &ctx->mmap_changing); + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 4c477dece540..a2c53e98dfd6 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,7 +83,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long len, atomic_t *mmap_changing); extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 90c958952bfc..66dd4cd277bd 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -305,6 +305,13 @@ struct uffdio_writeprotect { struct uffdio_continue { struct uffdio_range range; #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. 
+ */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) __u64 mode; /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a9b19b39413d..7f1b5f8b712c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -693,10 +693,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, } ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags) { return mfill_atomic(dst_mm, start, 0, len, mmap_changing, - uffd_flags_set_mode(0, MFILL_ATOMIC_CONTINUE)); + uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } long uffd_wp_range(struct vm_area_struct *dst_vma, diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index e030d63c031a..a96d126cb40e 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len) req.range.start = start; req.range.len = len; req.mode = 0; + if (test_uffdio_wp) + req.mode |= UFFDIO_CONTINUE_MODE_WP; if (ioctl(ufd, UFFDIO_CONTINUE, &req)) err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, @@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void) uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); -- cgit v1.2.3 From 3cce258ea4009992b7b4b64c2bebaf98948180ae Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Thu, 23 Mar 2023 11:31:19 +0530 Subject: selftests/mm: change MAP_CHUNK_SIZE Patch series "selftests: Fix virtual address range for arm64", v2. When the virtual address range selftest is run on arm64 and x86 platforms, it is observed that both the low and high VA range iterations are skipped when the MAP_CHUNK_SIZE is set to 16GB. The MAP_CHUNK_SIZE is changed to 1GB to resolve this issue, following which support for arm64 platform is added by changing the NR_CHUNKS_HIGH for aarch64 to accommodate up to 4PB of virtual address space allocation requests. Dynamic memory allocation of array holding addresses is introduced to prevent overflow of the stack. Finally, the overcommit_policy is set as OVERCOMMIT_ALWAYS to prevent the kernel from denying a memory allocation request based on a platform's physical memory availability. This patch (of 3): mmap() fails to allocate 16GB virtual space chunk, skipping both low and high VA range iterations. Hence, reduce MAP_CHUNK_SIZE to 1GB and update relevant macros as required. 
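For reference, the chunk arithmetic the new macros encode can be sanity-checked with a small standalone sketch; the SZ_* and NR_CHUNKS_128TB definitions mirror the ones added to the selftest in the diff below, and the printed values are 131072 chunks per 128TB at 1GB per chunk versus the previous 8192 at 16GB per chunk:

	#include <stdio.h>

	#define SZ_1GB (1024 * 1024 * 1024UL)
	#define SZ_1TB (1024 * 1024 * 1024 * 1024UL)
	#define MAP_CHUNK_SIZE SZ_1GB

	#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE)

	int main(void)
	{
		/* 128TB / 1GB = 131072; the old 16GB chunk size gave 8192. */
		printf("NR_CHUNKS_128TB = %lu\n", NR_CHUNKS_128TB);
		printf("NR_CHUNKS_256TB = %lu\n", NR_CHUNKS_128TB * 2UL);
		return 0;
	}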
Link: https://lkml.kernel.org/r/20230323060121.1175830-1-chaitanyas.prakash@arm.com Link: https://lkml.kernel.org/r/20230323060121.1175830-2-chaitanyas.prakash@arm.com Signed-off-by: Chaitanya S Prakash Cc: David Hildenbrand Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/virtual_address_range.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index c0592646ed93..50564512c5ee 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -15,11 +15,15 @@ /* * Maximum address range mapped with a single mmap() - * call is little bit more than 16GB. Hence 16GB is + * call is little bit more than 1GB. Hence 1GB is * chosen as the single chunk size for address space * mapping. */ -#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */ + +#define SZ_1GB (1024 * 1024 * 1024UL) +#define SZ_1TB (1024 * 1024 * 1024 * 1024UL) + +#define MAP_CHUNK_SIZE SZ_1GB /* * Address space till 128TB is mapped without any hint @@ -36,7 +40,7 @@ * are supported so far. */ -#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */ +#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */ #define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) #define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) -- cgit v1.2.3 From 3f9bea2b8a7e4952f03582d8137ab8bf6e96985b Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Thu, 23 Mar 2023 11:31:20 +0530 Subject: selftests/mm: change NR_CHUNKS_HIGH for aarch64 Although there is a provision for 52 bit VA on arm64 platform, it remains unutilised and higher addresses are not allocated. In order to accommodate 4PB [2^52] virtual address space where supported, NR_CHUNKS_HIGH is changed accordingly. Array holding addresses is changed from static allocation to dynamic allocation to accommodate its voluminous nature which otherwise might overflow the stack. Link: https://lkml.kernel.org/r/20230323060121.1175830-3-chaitanyas.prakash@arm.com Signed-off-by: Chaitanya S Prakash Cc: David Hildenbrand Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/virtual_address_range.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 50564512c5ee..bae0ceaf95b1 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -36,13 +36,15 @@ * till it reaches 512TB. One with size 128TB and the * other being 384TB. * - * On Arm64 the address space is 256TB and no high mappings - * are supported so far. + * On Arm64 the address space is 256TB and support for + * high mappings up to 4PB virtual address space has + * been added. 
*/ #define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */ #define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) #define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) +#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL) #define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ #define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ @@ -51,7 +53,7 @@ #define HIGH_ADDR_MARK ADDR_MARK_256TB #define HIGH_ADDR_SHIFT 49 #define NR_CHUNKS_LOW NR_CHUNKS_256TB -#define NR_CHUNKS_HIGH 0 +#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB #else #define HIGH_ADDR_MARK ADDR_MARK_128TB #define HIGH_ADDR_SHIFT 48 @@ -101,7 +103,7 @@ static int validate_lower_address_hint(void) int main(int argc, char *argv[]) { char *ptr[NR_CHUNKS_LOW]; - char *hptr[NR_CHUNKS_HIGH]; + char **hptr; char *hint; unsigned long i, lchunks, hchunks; @@ -119,6 +121,9 @@ int main(int argc, char *argv[]) return 1; } lchunks = i; + hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *)); + if (hptr == NULL) + return 1; for (i = 0; i < NR_CHUNKS_HIGH; i++) { hint = hind_addr(); @@ -139,5 +144,6 @@ int main(int argc, char *argv[]) for (i = 0; i < hchunks; i++) munmap(hptr[i], MAP_CHUNK_SIZE); + free(hptr); return 0; } -- cgit v1.2.3 From d6c2789778c59c1768567099eb2fdd32d790c38a Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Thu, 23 Mar 2023 11:31:21 +0530 Subject: selftests/mm: set overcommit_policy as OVERCOMMIT_ALWAYS The kernel's default behaviour is to obstruct the allocation of high virtual address as it handles memory overcommit in a heuristic manner. Setting the parameter as OVERCOMMIT_ALWAYS, ensures kernel isn't susceptible to the availability of a platform's physical memory when denying a memory allocation request. Link: https://lkml.kernel.org/r/20230323060121.1175830-4-chaitanyas.prakash@arm.com Signed-off-by: Chaitanya S Prakash Cc: David Hildenbrand Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 8984e0bb58c7..c0f93b668c0c 100644 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -220,7 +220,15 @@ CATEGORY="mremap" run_test ./mremap_test CATEGORY="hugetlb" run_test ./thuge-gen if [ $VADDR64 -ne 0 ]; then + + # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel + # allows high virtual address allocation requests independent + # of platform's physical memory. + + prev_policy=$(cat /proc/sys/vm/overcommit_memory) + echo 1 > /proc/sys/vm/overcommit_memory CATEGORY="hugevm" run_test ./virtual_address_range + echo $prev_policy > /proc/sys/vm/overcommit_memory # virtual address 128TB switch test CATEGORY="hugevm" run_test ./va_128TBswitch.sh -- cgit v1.2.3 From 3b7939c8e5345fb84309f2605a4b6f7ac8dd7fce Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Apr 2023 12:10:05 +0800 Subject: maple_tree: add a test case to check maple_alloc Add a test case to check whether the number of maple_alloc structures is actually equal to mas->alloc->total. Link: https://lkml.kernel.org/r/20230411041005.26205-2-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Cc: Liam R. 
Howlett Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 4c89ff333f6f..9286d3baa12d 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -55,6 +55,28 @@ struct rcu_reader_struct { struct rcu_test_struct2 *test; }; +static int get_alloc_node_count(struct ma_state *mas) +{ + int count = 1; + struct maple_alloc *node = mas->alloc; + + if (!node || ((unsigned long)node & 0x1)) + return 0; + while (node->node_count) { + count += node->node_count; + node = node->slot[0]; + } + return count; +} + +static void check_mas_alloc_node_count(struct ma_state *mas) +{ + mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 1, GFP_KERNEL); + mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 3, GFP_KERNEL); + MT_BUG_ON(mas->tree, get_alloc_node_count(mas) != mas->alloc->total); + mas_destroy(mas); +} + /* * check_new_node() - Check the creation of new nodes and error path * verification. @@ -69,6 +91,8 @@ static noinline void check_new_node(struct maple_tree *mt) MA_STATE(mas, mt, 0, 0); + check_mas_alloc_node_count(&mas); + /* Try allocating 3 nodes */ mtree_lock(mt); mt_set_non_kernel(0); -- cgit v1.2.3 From d6e61afb40e2208d90470f4b2012c4c8092863b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 11 Apr 2023 16:25:07 +0200 Subject: selftests/mm: reuse read_pmd_pagesize() in COW selftest Patch series "mm: (pte|pmd)_mkdirty() should not unconditionally allow for write access". This is the follow-up on [1], adding selftests (testing for known issues we added workarounds for and other issues that haven't been fixed yet), fixing sparc64, reverting the workarounds, and perform one cleanup. The patch from [1] was modified slightly (updated/extended patch description, dropped one unnecessary NOP instruction from the ASM in __pte_mkhwwrite()). Retested on x86_64 and sparc64 (sun4u in QEMU). I scanned most architectures to make sure their (pte|pmd)_mkdirty() handling is correct. To be sure, we can run the selftests and find out if other architectures are still affectes (loongarch was fixed recently as well). Based on master for now. I don't expect surprises regarding mm-tress, but I can rebase if there are any problems. This patch (of 6): The COW selftest can deal with THP not being configured. So move error handling of read_pmd_pagesize() into the callers such that we can reuse it in the COW selftest. Link: https://lkml.kernel.org/r/20230411142512.438404-1-david@redhat.com Link: https://lkml.kernel.org/r/20221212130213.136267-1-david@redhat.com [1] Link: https://lkml.kernel.org/r/20230411142512.438404-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: David S. 
Miller Cc: Hugh Dickins Cc: Peter Xu Cc: Sam Ravnborg Cc: Shuah Khan Cc: Yu Zhao Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 33 +++-------------------- tools/testing/selftests/mm/khugepaged.c | 4 +++ tools/testing/selftests/mm/soft-dirty.c | 3 +++ tools/testing/selftests/mm/split_huge_page_test.c | 4 +++ tools/testing/selftests/mm/vm_util.c | 4 +-- 5 files changed, 17 insertions(+), 31 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 0eb2e8180aa5..dc9d6fe86028 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -45,34 +45,6 @@ static size_t hugetlbsizes[10]; static int gup_fd; static bool has_huge_zeropage; -static void detect_thpsize(void) -{ - int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", - O_RDONLY); - size_t size = 0; - char buf[15]; - int ret; - - if (fd < 0) - return; - - ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { - buf[ret] = 0; - - size = strtoul(buf, NULL, 10); - if (size < pagesize) - size = 0; - if (size > 0) { - thpsize = size; - ksft_print_msg("[INFO] detected THP size: %zu KiB\n", - thpsize / 1024); - } - } - - close(fd); -} - static void detect_huge_zeropage(void) { int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", @@ -1741,7 +1713,10 @@ int main(int argc, char **argv) int err; pagesize = getpagesize(); - detect_thpsize(); + thpsize = read_pmd_pagesize(); + if (thpsize) + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", + thpsize / 1024); detect_hugetlbsizes(); detect_huge_zeropage(); diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 64126c8cd561..97adc0f34f9c 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -1476,6 +1476,10 @@ int main(int argc, const char **argv) page_size = getpagesize(); hpage_pmd_size = read_pmd_pagesize(); + if (!hpage_pmd_size) { + printf("Reading PMD pagesize failed"); + exit(EXIT_FAILURE); + } hpage_pmd_nr = hpage_pmd_size / page_size; default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 21d8830c5f24..cc5f144430d4 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -80,6 +80,9 @@ static void test_hugepage(int pagemap_fd, int pagesize) int i, ret; size_t hpage_len = read_pmd_pagesize(); + if (!hpage_len) + ksft_exit_fail_msg("Reading PMD pagesize failed"); + map = memalign(hpage_len, hpage_len); if (!map) ksft_exit_fail_msg("memalign failed\n"); diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index b8558c7f1a39..0e74635c8c3d 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -300,6 +300,10 @@ int main(int argc, char **argv) pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); + if (!pmd_pagesize) { + printf("Reading PMD pagesize failed\n"); + exit(EXIT_FAILURE); + } split_pmd_thp(); split_pte_mapped_thp(); diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 40e795624ff3..8dc74dd022c2 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -84,12 +84,12 @@ uint64_t read_pmd_pagesize(void) fd = open(PMD_SIZE_FILE_PATH, 
O_RDONLY); if (fd == -1) - ksft_exit_fail_msg("Open hpage_pmd_size failed\n"); + return 0; num_read = read(fd, buf, 19); if (num_read < 1) { close(fd); - ksft_exit_fail_msg("Read hpage_pmd_size failed\n"); + return 0; } buf[num_read] = '\0'; close(fd); -- cgit v1.2.3 From 9eac40fc0cc7b5bbc405af3b94cbe8aeb680ab26 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 11 Apr 2023 16:25:08 +0200 Subject: selftests/mm: mkdirty: test behavior of (pte|pmd)_mkdirty on VMAs without write permissions Let's add some tests that trigger (pte|pmd)_mkdirty on VMAs without write permissions. If an architecture implementation is wrong, we might accidentally set the PTE/PMD writable and allow for write access in a VMA without write permissions. The tests include reproducers for the two issues recently discovered and worked-around in core-MM for now: (1) commit 624a2c94f5b7 ("Partly revert "mm/thp: carry over dirty bit when thp splits on pmd"") (2) commit 96a9c287e25d ("mm/migrate: fix wrongly apply write bit after mkdirty on sparc64") In addition, some other tests that reveal further issues. All tests pass under x86_64: ./mkdirty # [INFO] detected THP size: 2048 KiB TAP version 13 1..6 # [INFO] PTRACE write access ok 1 SIGSEGV generated, page not modified # [INFO] PTRACE write access to THP ok 2 SIGSEGV generated, page not modified # [INFO] Page migration ok 3 SIGSEGV generated, page not modified # [INFO] Page migration of THP ok 4 SIGSEGV generated, page not modified # [INFO] PTE-mapping a THP ok 5 SIGSEGV generated, page not modified # [INFO] UFFDIO_COPY ok 6 SIGSEGV generated, page not modified # Totals: pass:6 fail:0 xfail:0 xpass:0 skip:0 error:0 But some fail on sparc64: ./mkdirty # [INFO] detected THP size: 8192 KiB TAP version 13 1..6 # [INFO] PTRACE write access not ok 1 SIGSEGV generated, page not modified # [INFO] PTRACE write access to THP not ok 2 SIGSEGV generated, page not modified # [INFO] Page migration ok 3 SIGSEGV generated, page not modified # [INFO] Page migration of THP ok 4 SIGSEGV generated, page not modified # [INFO] PTE-mapping a THP ok 5 SIGSEGV generated, page not modified # [INFO] UFFDIO_COPY not ok 6 SIGSEGV generated, page not modified Bail out! 3 out of 6 tests failed # Totals: pass:3 fail:3 xfail:0 xpass:0 skip:0 error:0 Reverting both above commits makes all tests fail on sparc64: ./mkdirty # [INFO] detected THP size: 8192 KiB TAP version 13 1..6 # [INFO] PTRACE write access not ok 1 SIGSEGV generated, page not modified # [INFO] PTRACE write access to THP not ok 2 SIGSEGV generated, page not modified # [INFO] Page migration not ok 3 SIGSEGV generated, page not modified # [INFO] Page migration of THP not ok 4 SIGSEGV generated, page not modified # [INFO] PTE-mapping a THP not ok 5 SIGSEGV generated, page not modified # [INFO] UFFDIO_COPY not ok 6 SIGSEGV generated, page not modified Bail out! 6 out of 6 tests failed # Totals: pass:0 fail:6 xfail:0 xpass:0 skip:0 error:0 The tests are useful to detect other problematic archs, to verify new arch fixes, and to stop such issues from reappearing in the future. For now, we don't add any hugetlb tests. Link: https://lkml.kernel.org/r/20230411142512.438404-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: David S. 
Miller Cc: Hugh Dickins Cc: Peter Xu Cc: Sam Ravnborg Cc: Shuah Khan Cc: Yu Zhao Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 + tools/testing/selftests/mm/mkdirty.c | 379 +++++++++++++++++++++++++++++++++++ 2 files changed, 381 insertions(+) create mode 100644 tools/testing/selftests/mm/mkdirty.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index c31d952cff68..8235dddbbbc6 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -47,6 +47,7 @@ TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate TEST_GEN_FILES += memfd_secret TEST_GEN_FILES += migration +TEST_GEN_PROGS += mkdirty TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += mrelease_test @@ -108,6 +109,7 @@ $(OUTPUT)/cow: vm_util.c $(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/ksm_functional_tests: vm_util.c $(OUTPUT)/madv_populate: vm_util.c +$(OUTPUT)/mkdirty: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c $(OUTPUT)/userfaultfd: vm_util.c diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c new file mode 100644 index 000000000000..6d71d972997b --- /dev/null +++ b/tools/testing/selftests/mm/mkdirty.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test handling of code that might set PTE/PMD dirty in read-only VMAs. + * Setting a PTE/PMD dirty must not accidentally set the PTE/PMD writable. + * + * Copyright 2023, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +static size_t pagesize; +static size_t thpsize; +static int mem_fd; +static int pagemap_fd; +static sigjmp_buf env; + +static void signal_handler(int sig) +{ + if (sig == SIGSEGV) + siglongjmp(env, 1); + siglongjmp(env, 2); +} + +static void do_test_write_sigsegv(char *mem) +{ + char orig = *mem; + int ret; + + if (signal(SIGSEGV, signal_handler) == SIG_ERR) { + ksft_test_result_fail("signal() failed\n"); + return; + } + + ret = sigsetjmp(env, 1); + if (!ret) + *mem = orig + 1; + + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) + ksft_test_result_fail("signal() failed\n"); + + ksft_test_result(ret == 1 && *mem == orig, + "SIGSEGV generated, page not modified\n"); +} + +static char *mmap_thp_range(int prot, char **_mmap_mem, size_t *_mmap_size) +{ + const size_t mmap_size = 2 * thpsize; + char *mem, *mmap_mem; + + mmap_mem = mmap(NULL, mmap_size, prot, MAP_PRIVATE|MAP_ANON, + -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return MAP_FAILED; + } + mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); + + if (madvise(mem, thpsize, MADV_HUGEPAGE)) { + ksft_test_result_skip("MADV_HUGEPAGE failed\n"); + munmap(mmap_mem, mmap_size); + return MAP_FAILED; + } + + *_mmap_mem = mmap_mem; + *_mmap_size = mmap_size; + return mem; +} + +static void test_ptrace_write(void) +{ + char data = 1; + char *mem; + int ret; + + ksft_print_msg("[INFO] PTRACE write access\n"); + + mem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + /* Fault in the shared zeropage. 
*/ + if (*mem != 0) { + ksft_test_result_fail("Memory not zero\n"); + goto munmap; + } + + /* + * Unshare the page (populating a fresh anon page that might be set + * dirty in the PTE) in the read-only VMA using ptrace (FOLL_FORCE). + */ + lseek(mem_fd, (uintptr_t) mem, SEEK_SET); + ret = write(mem_fd, &data, 1); + if (ret != 1 || *mem != data) { + ksft_test_result_fail("write() failed\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mem, pagesize); +} + +static void test_ptrace_write_thp(void) +{ + char *mem, *mmap_mem; + size_t mmap_size; + char data = 1; + int ret; + + ksft_print_msg("[INFO] PTRACE write access to THP\n"); + + mem = mmap_thp_range(PROT_READ, &mmap_mem, &mmap_size); + if (mem == MAP_FAILED) + return; + + /* + * Write to the first subpage in the read-only VMA using + * ptrace(FOLL_FORCE), eventually placing a fresh THP that is marked + * dirty in the PMD. + */ + lseek(mem_fd, (uintptr_t) mem, SEEK_SET); + ret = write(mem_fd, &data, 1); + if (ret != 1 || *mem != data) { + ksft_test_result_fail("write() failed\n"); + goto munmap; + } + + /* MM populated a THP if we got the last subpage populated as well. */ + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mmap_mem, mmap_size); +} + +static void test_page_migration(void) +{ + char *mem; + + ksft_print_msg("[INFO] Page migration\n"); + + mem = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, + -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + /* Populate a fresh page and dirty it. */ + memset(mem, 1, pagesize); + if (mprotect(mem, pagesize, PROT_READ)) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + + /* Trigger page migration. Might not be available or fail. */ + if (syscall(__NR_mbind, mem, pagesize, MPOL_LOCAL, NULL, 0x7fful, + MPOL_MF_MOVE)) { + ksft_test_result_skip("mbind() failed\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mem, pagesize); +} + +static void test_page_migration_thp(void) +{ + char *mem, *mmap_mem; + size_t mmap_size; + + ksft_print_msg("[INFO] Page migration of THP\n"); + + mem = mmap_thp_range(PROT_READ|PROT_WRITE, &mmap_mem, &mmap_size); + if (mem == MAP_FAILED) + return; + + /* + * Write to the first page, which might populate a fresh anon THP + * and dirty it. + */ + memset(mem, 1, pagesize); + if (mprotect(mem, thpsize, PROT_READ)) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + + /* MM populated a THP if we got the last subpage populated as well. */ + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + + /* Trigger page migration. Might not be available or fail. */ + if (syscall(__NR_mbind, mem, thpsize, MPOL_LOCAL, NULL, 0x7fful, + MPOL_MF_MOVE)) { + ksft_test_result_skip("mbind() failed\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mmap_mem, mmap_size); +} + +static void test_pte_mapped_thp(void) +{ + char *mem, *mmap_mem; + size_t mmap_size; + + ksft_print_msg("[INFO] PTE-mapping a THP\n"); + + mem = mmap_thp_range(PROT_READ|PROT_WRITE, &mmap_mem, &mmap_size); + if (mem == MAP_FAILED) + return; + + /* + * Write to the first page, which might populate a fresh anon THP + * and dirty it. 
+ */ + memset(mem, 1, pagesize); + if (mprotect(mem, thpsize, PROT_READ)) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + + /* MM populated a THP if we got the last subpage populated as well. */ + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + + /* Trigger PTE-mapping the THP by mprotect'ing the last subpage. */ + if (mprotect(mem + thpsize - pagesize, pagesize, + PROT_READ|PROT_WRITE)) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mmap_mem, mmap_size); +} + +#ifdef __NR_userfaultfd +static void test_uffdio_copy(void) +{ + struct uffdio_register uffdio_register; + struct uffdio_copy uffdio_copy; + struct uffdio_api uffdio_api; + char *dst, *src; + int uffd; + + ksft_print_msg("[INFO] UFFDIO_COPY\n"); + + src = malloc(pagesize); + memset(src, 1, pagesize); + dst = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0); + if (dst == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + ksft_test_result_skip("__NR_userfaultfd failed\n"); + goto munmap; + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { + ksft_test_result_fail("UFFDIO_API failed\n"); + goto close_uffd; + } + + uffdio_register.range.start = (unsigned long) dst; + uffdio_register.range.len = pagesize; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + ksft_test_result_fail("UFFDIO_REGISTER failed\n"); + goto close_uffd; + } + + /* Place a page in a read-only VMA, which might set the PTE dirty. */ + uffdio_copy.dst = (unsigned long) dst; + uffdio_copy.src = (unsigned long) src; + uffdio_copy.len = pagesize; + uffdio_copy.mode = 0; + if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) { + ksft_test_result_fail("UFFDIO_COPY failed\n"); + goto close_uffd; + } + + do_test_write_sigsegv(dst); +close_uffd: + close(uffd); +munmap: + munmap(dst, pagesize); + free(src); +#endif /* __NR_userfaultfd */ +} + +int main(void) +{ + int err, tests = 2; + + pagesize = getpagesize(); + thpsize = read_pmd_pagesize(); + if (thpsize) { + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", + thpsize / 1024); + tests += 3; + } +#ifdef __NR_userfaultfd + tests += 1; +#endif /* __NR_userfaultfd */ + + ksft_print_header(); + ksft_set_plan(tests); + + mem_fd = open("/proc/self/mem", O_RDWR); + if (mem_fd < 0) + ksft_exit_fail_msg("opening /proc/self/mem failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("opening /proc/self/pagemap failed\n"); + + /* + * On some ptrace(FOLL_FORCE) write access via /proc/self/mem in + * read-only VMAs, the kernel may set the PTE/PMD dirty. + */ + test_ptrace_write(); + if (thpsize) + test_ptrace_write_thp(); + /* + * On page migration, the kernel may set the PTE/PMD dirty when + * remapping the page. + */ + test_page_migration(); + if (thpsize) + test_page_migration_thp(); + /* PTE-mapping a THP might propagate the dirty PMD bit to the PTEs. */ + if (thpsize) + test_pte_mapped_thp(); + /* Placing a fresh page via userfaultfd may set the PTE dirty. 
*/ +#ifdef __NR_userfaultfd + test_uffdio_copy(); +#endif /* __NR_userfaultfd */ + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} -- cgit v1.2.3 From c14ef37871fcee9dbcaebb1bc73ac4da21547c13 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:41:14 -0400 Subject: selftests/mm: update .gitignore with two missing tests Patch series "selftests/mm: Split / Refactor userfault test", v2. This patchset splits userfaultfd.c into two tests: - uffd-stress: the "vanilla", old and powerful stress test - uffd-unit-tests: all the unit tests will be moved here This is on my todo list for a long time but I never did it for real. The uffd test is growing into a small and cute monster. I start to notice it's going harder to maintain such a test and make it useful. A few issues I found when looking at userfaultfd test: - We have a bunch of unit tests in userfaultfd.c, but they always need to be run only after a stress type. No way to not do it. - We can only run an unit test for one memory type only, if we want to do a quick smoke test to check regressions, there's no good way. The best to come currently is "bash ./run_vmtests.sh -t userfaultfd" thanks to the most recent changes to run_vmtests.sh on tagging. Still, that needs to run the stress tests always and hard to see what's wrong. - It's hard to add a new unit test to userfaultfd.c, we don't really know what's happening, not until we mostly read the whole file. - We did a bunch of useless tests, e.g. we run twice the whole suite of stress test just to verify both syscall and /dev/userfaultfd. They're all using userfaultfd_new() to create the handle, everything should really be the same underneath. One simple unit test should cover that! - We have tens of global variables in one file but shared with all the tests. Some of them are not suitable to be a global var from maintainance pov. It enforces every unit test to consider how these vars affects the stress test and vice versa, but that's logically not necessary. - Userfaultfd test is not friendly to old kernels. Mostly it only works on the latest kernel tree. It's preferrable to be run on all kernels and properly report what's missing. I'll stop here, I feel like I can still list some.. This patchset should resolve all issues above, and actually we can do even more on top. I stopped doing that until I found I already got 29 patches and 2000+ LOC changes. That's already a patchset terrible enough so we should move in small steps. After the whole set applied, "./run_vmtests.sh -t userfaultfd" looks like this: ===8<=== vm.nr_hugepages = 1024 ------------------------- running ./uffd-unit-tests ------------------------- Testing UFFDIO_API (with syscall)... done Testing UFFDIO_API (with /dev/userfaultfd)... done Testing register-ioctls on anon... done Testing register-ioctls on shmem... done Testing register-ioctls on shmem-private... done Testing register-ioctls on hugetlb... done Testing register-ioctls on hugetlb-private... done Testing zeropage on anon... done Testing zeropage on shmem... done Testing zeropage on shmem-private... done Testing zeropage on hugetlb... done Testing zeropage on hugetlb-private... done Testing pagemap on anon... done Testing wp-unpopulated on anon... done Testing minor on shmem... done Testing minor on hugetlb... done Testing minor-wp on shmem... done Testing minor-wp on hugetlb... done Testing minor-collapse on shmem... done Testing sigbus on anon... 
done Testing sigbus on shmem... done Testing sigbus on shmem-private... done Testing sigbus on hugetlb... done Testing sigbus on hugetlb-private... done Testing sigbus-wp on anon... done Testing sigbus-wp on shmem... done Testing sigbus-wp on shmem-private... done Testing sigbus-wp on hugetlb... done Testing sigbus-wp on hugetlb-private... done Testing events on anon... done Testing events on shmem... done Testing events on shmem-private... done Testing events on hugetlb... done Testing events on hugetlb-private... done Testing events-wp on anon... done Testing events-wp on shmem... done Testing events-wp on shmem-private... done Testing events-wp on hugetlb... done Testing events-wp on hugetlb-private... done Userfaults unit tests: pass=39, skip=0, fail=0 (total=39) [PASS] -------------------------------- running ./uffd-stress anon 20 16 -------------------------------- nr_pages: 5120, nr_pages_per_cpu: 640 bounces: 15, mode: rnd racing ver poll, userfaults: 345 missing (26+48+61+102+30+12+59+7) 1596 wp (120+139+317+346+215+67+306+86) [...] [PASS] ------------------------------------ running ./uffd-stress hugetlb 128 32 ------------------------------------ nr_pages: 64, nr_pages_per_cpu: 8 bounces: 31, mode: rnd racing ver poll, userfaults: 29 missing (6+6+6+5+4+2+0+0) 104 wp (20+19+22+18+7+12+5+1) [...] [PASS] -------------------------------------------- running ./uffd-stress hugetlb-private 128 32 -------------------------------------------- nr_pages: 64, nr_pages_per_cpu: 8 bounces: 31, mode: rnd racing ver poll, userfaults: 33 missing (12+9+7+0+5+0+0+0) 111 wp (24+25+14+14+11+17+5+1) [...] [PASS] --------------------------------- running ./uffd-stress shmem 20 16 --------------------------------- nr_pages: 5120, nr_pages_per_cpu: 640 bounces: 15, mode: rnd racing ver poll, userfaults: 247 missing (15+17+34+60+81+37+3+0) 2038 wp (180+114+276+400+381+318+165+204) [...] [PASS] ----------------------------------------- running ./uffd-stress shmem-private 20 16 ----------------------------------------- nr_pages: 5120, nr_pages_per_cpu: 640 bounces: 15, mode: rnd racing ver poll, userfaults: 235 missing (52+29+55+56+13+9+16+5) 2849 wp (218+406+461+531+328+284+430+191) [...] [PASS] SUMMARY: PASS=6 SKIP=0 FAIL=0 ===8<=== The output may be different if we miss some features (e.g., hugetlb not allocated, old kernel, less privilege of uffd handle), but they should show up with good reasons. E.g., I tried to run the unit test on my Fedora kernel and it gives me: ===8<=== UFFDIO_API (with syscall)... failed [reason: UFFDIO_API should fail with wrong api but didn't] UFFDIO_API (with /dev/userfaultfd)... skipped [reason: cannot open userfaultfd handle] zeropage on anon... done zeropage on shmem... done zeropage on shmem-private... done zeropage-hugetlb on hugetlb... done zeropage-hugetlb on hugetlb-private... done pagemap on anon... pagemap on anon... pagemap on anon... done wp-unpopulated on anon... skipped [reason: feature missing] minor on shmem... done minor on hugetlb... done minor-wp on shmem... skipped [reason: feature missing] minor-wp on hugetlb... skipped [reason: feature missing] minor-collapse on shmem... done sigbus on anon... skipped [reason: possible lack of priviledge] sigbus on shmem... skipped [reason: possible lack of priviledge] sigbus on shmem-private... skipped [reason: possible lack of priviledge] sigbus on hugetlb... skipped [reason: possible lack of priviledge] sigbus on hugetlb-private... skipped [reason: possible lack of priviledge] sigbus-wp on anon... 
skipped [reason: possible lack of priviledge] sigbus-wp on shmem... skipped [reason: possible lack of priviledge] sigbus-wp on shmem-private... skipped [reason: possible lack of priviledge] sigbus-wp on hugetlb... skipped [reason: possible lack of priviledge] sigbus-wp on hugetlb-private... skipped [reason: possible lack of priviledge] events on anon... skipped [reason: possible lack of priviledge] events on shmem... skipped [reason: possible lack of priviledge] events on shmem-private... skipped [reason: possible lack of priviledge] events on hugetlb... skipped [reason: possible lack of priviledge] events on hugetlb-private... skipped [reason: possible lack of priviledge] events-wp on anon... skipped [reason: possible lack of priviledge] events-wp on shmem... skipped [reason: possible lack of priviledge] events-wp on shmem-private... skipped [reason: possible lack of priviledge] events-wp on hugetlb... skipped [reason: possible lack of priviledge] events-wp on hugetlb-private... skipped [reason: possible lack of priviledge] Userfaults unit tests: pass=9, skip=24, fail=1 (total=34) ===8<=== Patch layout: - Revert "userfaultfd: don't fail on unrecognized features" Something I found when I got the UFFDIO_API test below. Axel, I still propose to revert it as a whole, but feel free to continue the discussion from the original patch thread. - selftests/mm: Update .gitignore with two missing tests - selftests/mm: Dump a summary in run_vmtests.sh - selftests/mm: Merge util.h into vm_util.h - selftests/mm: Use TEST_GEN_PROGS where proper - selftests/mm: Link vm_util.c always - selftests/mm: Merge default_huge_page_size() into one - selftests/mm: Use PM_* macros in vm_utils.h - selftests/mm: Reuse pagemap_get_entry() in vm_util.h - selftests/mm: Test UFFDIO_ZEROPAGE only when !hugetlb - selftests/mm: Drop test_uffdio_zeropage_eexist Until here, all cleanups here and there. I wanted to keep going, but I found that maybe it'll take a few more days to split the test. Hence I did a split starting from the next one, so we have a working thing first. - selftests/mm: Create uffd-common.[ch] - selftests/mm: Split uffd tests into uffd-stress and uffd-unit-tests This did the major brute force split of common codes into uffd-common.[ch]. That'll be the so far common base for stress and unit tests. Then a new unit test is created. - selftests/mm: uffd_[un]register() - selftests/mm: uffd_open_{dev|sys}() - selftests/mm: UFFDIO_API test This patch hides here to start writting the 1st unit test with UFFDIO_API, also detection of userfaultfd privileges. - selftests/mm: Drop global mem_fd in uffd tests - selftests/mm: Drop global hpage_size in uffd tests - selftests/mm: Rename uffd_stats to uffd_args - selftests/mm: Let uffd_handle_page_fault() takes wp parameter - selftests/mm: Allow allocate_area() to fail properly Some further cleanup that I noticed otherwise hard to move the tests. - selftests/mm: Add framework for uffd-unit-test The major patch provides the framework for most of the rest unit tests. - selftests/mm: Move uffd pagemap test to unit test - selftests/mm: Move uffd minor test to unit test - selftests/mm: Move uffd sig/events tests into uffd unit tests - selftests/mm: Move zeropage test into uffd unit tests Move unit tests and suite them into the new file. 
- selftests/mm: Workaround no way to detect uffd-minor + wp - selftests/mm: Allow uffd test to skip properly with no privilege - selftests/mm: Drop sys/dev test in uffd-stress test - selftests/mm: Add shmem-private test to uffd-stress A bunch of changes to do better on error reportings, and add shmem-private to the stress test which was long missing. - selftests/mm: Add uffdio register ioctls test One more patch to test uffdio_register.ioctls. This patch (of 30): Update .gitignore with two missing tests. Link: https://lkml.kernel.org/r/20230412163922.327282-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20230412164114.327709-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Axel Rasmussen Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 1f8c36a9fa10..347277f2adc3 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -36,3 +36,5 @@ split_huge_page_test ksm_tests local_config.h local_config.mk +ksm_functional_tests +mdwe_test -- cgit v1.2.3 From c7c55fc4e39aed6a5a4d47f2201e53b1efb24ca6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:41:17 -0400 Subject: selftests/mm: dump a summary in run_vmtests.sh Dump a summary after running whatever test specified. Useful for human runners to identify any kind of failures (besides exit code). Link: https://lkml.kernel.org/r/20230412164117.327720-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index c0f93b668c0c..ddf40f883747 100644 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -5,6 +5,9 @@ # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 +count_pass=0 +count_fail=0 +count_skip=0 exitcode=0 usage() { @@ -149,11 +152,14 @@ run_test() { "$@" local ret=$? if [ $ret -eq 0 ]; then + count_pass=$(( count_pass + 1 )) echo "[PASS]" elif [ $ret -eq $ksft_skip ]; then + count_skip=$(( count_skip + 1 )) echo "[SKIP]" exitcode=$ksft_skip else + count_fail=$(( count_fail + 1 )) echo "[FAIL]" exitcode=1 fi @@ -279,4 +285,6 @@ CATEGORY="soft_dirty" run_test ./soft-dirty # COW tests CATEGORY="cow" run_test ./cow +echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" + exit $exitcode -- cgit v1.2.3 From af605d26a8f26e9e46fa82246dc1241efd3834a5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:41:20 -0400 Subject: selftests/mm: merge util.h into vm_util.h There're two util headers under mm/ kselftest. Merge one with another. It turns out util.h is the easy one to move. When merging, drop PAGE_SIZE / PAGE_SHIFT because they're unnecessary wrappers to page_size() / page_shift(), meanwhile rename them to psize() and pshift() so as to not conflict with some existing definitions in some test files that includes vm_util.h. 
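To make the rename concrete at a call site, here is a minimal sketch (hypothetical caller, not taken from this patch) of what test code looks like once the PAGE_SIZE/PAGE_SHIFT wrappers are gone and psize()/pshift() are used directly:

===8<===
/* Hypothetical caller, only to show the rename; not part of this patch. */
#include <stddef.h>
#include "vm_util.h"

static void touch_every_page(char *buf, size_t nr_pages)
{
	size_t i;

	/* Was: buf[i * PAGE_SIZE] = 0; the macro wrappers no longer exist. */
	for (i = 0; i < nr_pages; i++)
		buf[i * psize()] = 0;
}
===8<===

psize() and pshift() cache the sysconf() result in __page_size/__page_shift, so calling them in a loop stays cheap.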
Link: https://lkml.kernel.org/r/20230412164120.327731-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 4 ++ tools/testing/selftests/mm/gup_test.c | 5 +- tools/testing/selftests/mm/ksm_tests.c | 2 +- tools/testing/selftests/mm/mrelease_test.c | 11 ++--- tools/testing/selftests/mm/transhuge-stress.c | 12 ++--- tools/testing/selftests/mm/util.h | 69 --------------------------- tools/testing/selftests/mm/vm_util.c | 31 ++++++++++++ tools/testing/selftests/mm/vm_util.h | 31 ++++++++++++ 8 files changed, 80 insertions(+), 85 deletions(-) delete mode 100644 tools/testing/selftests/mm/util.h (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 8235dddbbbc6..9d9822b4bf24 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -113,6 +113,10 @@ $(OUTPUT)/mkdirty: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c $(OUTPUT)/userfaultfd: vm_util.c +$(OUTPUT)/gup_test: vm_util.c +$(OUTPUT)/mrelease_test: vm_util.c +$(OUTPUT)/transhuge-stress: vm_util.c +$(OUTPUT)/ksm_tests: vm_util.c ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index e43879291dac..ec2229136384 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -12,8 +12,7 @@ #include #include #include "../kselftest.h" - -#include "util.h" +#include "vm_util.h" #define MB (1UL << 20) @@ -251,7 +250,7 @@ int main(int argc, char **argv) if (touch) { gup.gup_flags |= FOLL_TOUCH; } else { - for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + for (; (unsigned long)p < gup.addr + size; p += psize()) p[0] = 0; } diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c index f9eb4d67e0dd..74281593a124 100644 --- a/tools/testing/selftests/mm/ksm_tests.c +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -12,7 +12,7 @@ #include "../kselftest.h" #include -#include "util.h" +#include "vm_util.h" #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" #define KSM_FP(s) (KSM_SYSFS_PATH s) diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c index 6c62966ab5db..37b6d33b9e84 100644 --- a/tools/testing/selftests/mm/mrelease_test.c +++ b/tools/testing/selftests/mm/mrelease_test.c @@ -9,8 +9,7 @@ #include #include #include - -#include "util.h" +#include "vm_util.h" #include "../kselftest.h" @@ -32,7 +31,7 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) unsigned long i; char *buf; - buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, + buf = (char *)mmap(NULL, nr_pages * psize(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0); if (buf == MAP_FAILED) { perror("mmap failed, halting the test"); @@ -40,7 +39,7 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) } for (i = 0; i < nr_pages; i++) - *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; + *((unsigned long *)(buf + (i * psize()))) = i; /* Signal the parent that the child is ready */ if (write(pipefd, "", 1) < 0) { @@ -54,7 +53,7 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) timeout--; } - munmap(buf, nr_pages * PAGE_SIZE); + munmap(buf, 
nr_pages * psize()); return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; } @@ -87,7 +86,7 @@ static int child_main(int pipefd[], size_t size) /* Allocate and fault-in memory and wait to be killed */ close(pipefd[0]); - res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); + res = alloc_noexit(MB(size) / psize(), pipefd[1]); close(pipefd[1]); return res; } diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c index e3f00adb1b82..ba9d37ad3a89 100644 --- a/tools/testing/selftests/mm/transhuge-stress.c +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -15,7 +15,7 @@ #include #include #include -#include "util.h" +#include "vm_util.h" int backing_fd = -1; int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; @@ -34,10 +34,10 @@ int main(int argc, char **argv) int pagemap_fd; ram = sysconf(_SC_PHYS_PAGES); - if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) + if (ram > SIZE_MAX / psize() / 4) ram = SIZE_MAX / 4; else - ram *= sysconf(_SC_PAGESIZE); + ram *= psize(); len = ram; while (++i < argc) { @@ -58,7 +58,7 @@ int main(int argc, char **argv) warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, - ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); + ram >> (20 + HPAGE_SHIFT - pshift() - 1)); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) @@ -92,7 +92,7 @@ int main(int argc, char **argv) if (pfn < 0) { nr_failed++; } else { - size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT); + size_t idx = pfn >> (HPAGE_SHIFT - pshift()); nr_succeed++; if (idx >= map_len) { @@ -108,7 +108,7 @@ int main(int argc, char **argv) } /* split transhuge page, keep last page */ - if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED)) + if (madvise(p, HPAGE_SIZE - psize(), MADV_DONTNEED)) err(2, "MADV_DONTNEED"); } clock_gettime(CLOCK_MONOTONIC, &b); diff --git a/tools/testing/selftests/mm/util.h b/tools/testing/selftests/mm/util.h deleted file mode 100644 index b27d26199334..000000000000 --- a/tools/testing/selftests/mm/util.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __KSELFTEST_VM_UTIL_H -#define __KSELFTEST_VM_UTIL_H - -#include -#include -#include -#include /* ffsl() */ -#include /* _SC_PAGESIZE */ - -static unsigned int __page_size; -static unsigned int __page_shift; - -static inline unsigned int page_size(void) -{ - if (!__page_size) - __page_size = sysconf(_SC_PAGESIZE); - return __page_size; -} - -static inline unsigned int page_shift(void) -{ - if (!__page_shift) - __page_shift = (ffsl(page_size()) - 1); - return __page_shift; -} - -#define PAGE_SHIFT (page_shift()) -#define PAGE_SIZE (page_size()) -/* - * On ppc64 this will only work with radix 2M hugepage size - */ -#define HPAGE_SHIFT 21 -#define HPAGE_SIZE (1 << HPAGE_SHIFT) - -#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) -#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) - - -static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) -{ - uint64_t ent[2]; - - /* drop pmd */ - if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_ANONYMOUS | - MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) - errx(2, "mmap transhuge"); - - if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - /* allocate transparent huge page */ - *(volatile void **)ptr = ptr; - - if (pread(pagemap_fd, ent, sizeof(ent), - (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) - err(2, "read pagemap"); - - if (PAGEMAP_PRESENT(ent[0]) && 
PAGEMAP_PRESENT(ent[1]) && - PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && - !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) - return PAGEMAP_PFN(ent[0]); - - return -1; -} - -#endif diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 8dc74dd022c2..85411ee7ff8b 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -8,6 +8,9 @@ #define SMAP_FILE_PATH "/proc/self/smaps" #define MAX_LINE_LENGTH 500 +unsigned int __page_size; +unsigned int __page_shift; + uint64_t pagemap_get_entry(int fd, char *start) { const unsigned long pfn = (unsigned long)start / getpagesize(); @@ -149,3 +152,31 @@ bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) { return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); } + +int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + errx(2, "mmap transhuge"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (pshift() - 3)) != sizeof(ent)) + err(2, "read pagemap"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - pshift())) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 1995ee911ef2..6edeb531afc6 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -1,6 +1,27 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include +#include +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +extern unsigned int __page_size; +extern unsigned int __page_shift; + +static inline unsigned int psize(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned int pshift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(psize()) - 1); + return __page_shift; +} uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); @@ -13,3 +34,13 @@ uint64_t read_pmd_pagesize(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); +int64_t allocate_transhuge(void *ptr, int pagemap_fd); + +/* + * On ppc64 this will only work with radix 2M hugepage size + */ +#define HPAGE_SHIFT 21 +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) -- cgit v1.2.3 From aef6fde75d8c6c1cad4a0e017a8d4cbee2143723 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:18 -0400 Subject: selftests/mm: use TEST_GEN_PROGS where proper TEST_GEN_PROGS and TEST_GEN_FILES are used randomly in the mm/Makefile to specify programs that need to build. Logically all these binaries should all fall into TEST_GEN_PROGS. Replace those TEST_GEN_FILES with TEST_GEN_PROGS, so that we can reference all the tests easily later. 
[peterx@redhat.com: tools/testing/selftests/mm/Makefile: don't wipe out TEST_GEN_PROGS] Link: https://lkml.kernel.org/r/ZDxrvZh/cw357D8P@x1n Link: https://lkml.kernel.org/r/20230412164218.328104-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Axel Rasmussen Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 65 +++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 9d9822b4bf24..fa3ad6ca5859 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -31,35 +31,36 @@ MAKEFLAGS += --no-builtin-rules CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) LDLIBS = -lrt -lpthread -TEST_GEN_FILES = cow -TEST_GEN_FILES += compaction_test -TEST_GEN_FILES += gup_test -TEST_GEN_FILES += hmm-tests -TEST_GEN_FILES += hugetlb-madvise -TEST_GEN_FILES += hugepage-mmap -TEST_GEN_FILES += hugepage-mremap -TEST_GEN_FILES += hugepage-shm -TEST_GEN_FILES += hugepage-vmemmap -TEST_GEN_FILES += khugepaged -TEST_GEN_PROGS = madv_populate -TEST_GEN_FILES += map_fixed_noreplace -TEST_GEN_FILES += map_hugetlb -TEST_GEN_FILES += map_populate -TEST_GEN_FILES += memfd_secret -TEST_GEN_FILES += migration + +TEST_GEN_PROGS = cow +TEST_GEN_PROGS += compaction_test +TEST_GEN_PROGS += gup_test +TEST_GEN_PROGS += hmm-tests +TEST_GEN_PROGS += hugetlb-madvise +TEST_GEN_PROGS += hugepage-mmap +TEST_GEN_PROGS += hugepage-mremap +TEST_GEN_PROGS += hugepage-shm +TEST_GEN_PROGS += hugepage-vmemmap +TEST_GEN_PROGS += khugepaged +TEST_GEN_PROGS += madv_populate +TEST_GEN_PROGS += map_fixed_noreplace +TEST_GEN_PROGS += map_hugetlb +TEST_GEN_PROGS += map_populate +TEST_GEN_PROGS += memfd_secret +TEST_GEN_PROGS += migration TEST_GEN_PROGS += mkdirty -TEST_GEN_FILES += mlock-random-test -TEST_GEN_FILES += mlock2-tests -TEST_GEN_FILES += mrelease_test -TEST_GEN_FILES += mremap_dontunmap -TEST_GEN_FILES += mremap_test -TEST_GEN_FILES += on-fault-limit -TEST_GEN_FILES += thuge-gen -TEST_GEN_FILES += transhuge-stress -TEST_GEN_FILES += userfaultfd +TEST_GEN_PROGS += mlock-random-test +TEST_GEN_PROGS += mlock2-tests +TEST_GEN_PROGS += mrelease_test +TEST_GEN_PROGS += mremap_dontunmap +TEST_GEN_PROGS += mremap_test +TEST_GEN_PROGS += on-fault-limit +TEST_GEN_PROGS += thuge-gen +TEST_GEN_PROGS += transhuge-stress +TEST_GEN_PROGS += userfaultfd TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test -TEST_GEN_FILES += ksm_tests +TEST_GEN_PROGS += ksm_tests TEST_GEN_PROGS += ksm_functional_tests TEST_GEN_PROGS += mdwe_test @@ -77,24 +78,24 @@ CFLAGS += -no-pie endif ifeq ($(CAN_BUILD_I386),1) -TEST_GEN_FILES += $(BINARIES_32) +TEST_GEN_PROGS += $(BINARIES_32) endif ifeq ($(CAN_BUILD_X86_64),1) -TEST_GEN_FILES += $(BINARIES_64) +TEST_GEN_PROGS += $(BINARIES_64) endif else ifneq (,$(findstring $(MACHINE),ppc64)) -TEST_GEN_FILES += protection_keys +TEST_GEN_PROGS += protection_keys endif endif ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) -TEST_GEN_FILES += va_128TBswitch -TEST_GEN_FILES += virtual_address_range -TEST_GEN_FILES += write_to_hugetlbfs +TEST_GEN_PROGS += va_128TBswitch +TEST_GEN_PROGS += virtual_address_range +TEST_GEN_PROGS += write_to_hugetlbfs endif TEST_PROGS := run_vmtests.sh -- cgit v1.2.3 From 
4b54f5a758b7ac521d4658cb0d18b132b87597c0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:20 -0400 Subject: selftests/mm: link vm_util.c always We do have plenty of files that want to link against vm_util.c. Just make it simple by linking it always. Link: https://lkml.kernel.org/r/20230412164220.328123-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Axel Rasmussen Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index fa3ad6ca5859..b2a8cac79899 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -106,18 +106,7 @@ TEST_FILES += va_128TBswitch.sh include ../lib.mk -$(OUTPUT)/cow: vm_util.c -$(OUTPUT)/khugepaged: vm_util.c -$(OUTPUT)/ksm_functional_tests: vm_util.c -$(OUTPUT)/madv_populate: vm_util.c -$(OUTPUT)/mkdirty: vm_util.c -$(OUTPUT)/soft-dirty: vm_util.c -$(OUTPUT)/split_huge_page_test: vm_util.c -$(OUTPUT)/userfaultfd: vm_util.c -$(OUTPUT)/gup_test: vm_util.c -$(OUTPUT)/mrelease_test: vm_util.c -$(OUTPUT)/transhuge-stress: vm_util.c -$(OUTPUT)/ksm_tests: vm_util.c +$(TEST_GEN_PROGS): vm_util.c ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) -- cgit v1.2.3 From bd4d67e76f699688d15e6194f1505b8bdfee0dd7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:23 -0400 Subject: selftests/mm: merge default_huge_page_size() into one There're already 3 same definitions of the three functions. Move it into vm_util.[ch]. 
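As a usage sketch of the now-shared helper (hypothetical caller, not part of this patch; the helper returns the default hugetlb page size in bytes, or 0 when /proc/meminfo has no "Hugepagesize:" line or cannot be read):

===8<===
/* Hypothetical caller of the shared helper; not part of this patch. */
#include <stdio.h>
#include "vm_util.h"

int main(void)
{
	unsigned long hps = default_huge_page_size();

	if (!hps) {
		/* No "Hugepagesize:" line found, or /proc/meminfo unreadable. */
		printf("no default hugetlb page size detected\n");
		return 1;
	}
	printf("default hugetlb page size: %lu kB\n", hps >> 10);
	return 0;
}
===8<===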
Link: https://lkml.kernel.org/r/20230412164223.328134-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb-madvise.c | 25 +------------------------ tools/testing/selftests/mm/thuge-gen.c | 19 +------------------ tools/testing/selftests/mm/userfaultfd.c | 24 ------------------------ tools/testing/selftests/mm/vm_util.c | 21 +++++++++++++++++++++ tools/testing/selftests/mm/vm_util.h | 1 + 5 files changed, 24 insertions(+), 66 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 9a127a8fe176..28426e30d9bc 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include "vm_util.h" #define MIN_FREE_PAGES 20 #define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ @@ -35,30 +36,6 @@ unsigned long huge_page_size; unsigned long base_page_size; -/* - * default_huge_page_size copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - unsigned long get_free_hugepages(void) { unsigned long fhp = 0; diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 361ef7192cc6..380ab5f0a534 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -24,6 +24,7 @@ #include #include #include +#include "vm_util.h" #define err(x) perror(x), exit(1) @@ -74,24 +75,6 @@ void find_pagesizes(void) globfree(&g); } -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - free(line); - return hps; -} - void show(unsigned long ps) { char buf[100]; diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index a96d126cb40e..4cc80a0e8955 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -1703,30 +1703,6 @@ static int userfaultfd_stress(void) || userfaultfd_events_test() || userfaultfd_minor_test(); } -/* - * Copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - static void set_test_type(const char *type) { if (!strcmp(type, "anon")) { diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 85411ee7ff8b..7ffad87aa7e8 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -180,3 +180,24 @@ int64_t allocate_transhuge(void *ptr, 
int pagemap_fd) return -1; } + +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 6edeb531afc6..d7163fff8fb7 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -35,6 +35,7 @@ bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); int64_t allocate_transhuge(void *ptr, int pagemap_fd); +unsigned long default_huge_page_size(void); /* * On ppc64 this will only work with radix 2M hugepage size -- cgit v1.2.3 From 9f74696bd23da71ab19b1825f813421904cd4169 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:27 -0400 Subject: selftests/mm: use PM_* macros in vm_utils.h We've got the macros in uffd-stress.c, move it over and use it in vm_util.h. Link: https://lkml.kernel.org/r/20230412164227.328145-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/userfaultfd.c | 8 -------- tools/testing/selftests/mm/vm_util.c | 16 ++++------------ tools/testing/selftests/mm/vm_util.h | 8 ++++++++ 3 files changed, 12 insertions(+), 20 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index 4cc80a0e8955..7e841f7e2884 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -1389,14 +1389,6 @@ static int userfaultfd_minor_test(void) return stats.missing_faults != 0 || stats.minor_faults != nr_pages; } -#define BIT_ULL(nr) (1ULL << (nr)) -#define PM_SOFT_DIRTY BIT_ULL(55) -#define PM_MMAP_EXCLUSIVE BIT_ULL(56) -#define PM_UFFD_WP BIT_ULL(57) -#define PM_FILE BIT_ULL(61) -#define PM_SWAP BIT_ULL(62) -#define PM_PRESENT BIT_ULL(63) - static int pagemap_open(void) { int fd = open("/proc/self/pagemap", O_RDONLY); diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 7ffad87aa7e8..54d227d6f70a 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -25,25 +25,17 @@ uint64_t pagemap_get_entry(int fd, char *start) bool pagemap_is_softdirty(int fd, char *start) { - uint64_t entry = pagemap_get_entry(fd, start); - - // Check if dirty bit (55th bit) is set - return entry & 0x0080000000000000ull; + return pagemap_get_entry(fd, start) & PM_SOFT_DIRTY; } bool pagemap_is_swapped(int fd, char *start) { - uint64_t entry = pagemap_get_entry(fd, start); - - return entry & 0x4000000000000000ull; + return pagemap_get_entry(fd, start) & PM_SWAP; } bool pagemap_is_populated(int fd, char *start) { - uint64_t entry = pagemap_get_entry(fd, start); - - /* Present or swapped. 
*/ - return entry & 0xc000000000000000ull; + return pagemap_get_entry(fd, start) & (PM_PRESENT | PM_SWAP); } unsigned long pagemap_get_pfn(int fd, char *start) @@ -51,7 +43,7 @@ unsigned long pagemap_get_pfn(int fd, char *start) uint64_t entry = pagemap_get_entry(fd, start); /* If present (63th bit), PFN is at bit 0 -- 54. */ - if (entry & 0x8000000000000000ull) + if (entry & PM_PRESENT) return entry & 0x007fffffffffffffull; return -1ul; } diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index d7163fff8fb7..d9fadddb5c69 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -6,6 +6,14 @@ #include /* ffsl() */ #include /* _SC_PAGESIZE */ +#define BIT_ULL(nr) (1ULL << (nr)) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + extern unsigned int __page_size; extern unsigned int __page_shift; -- cgit v1.2.3 From 366e93c46576e77f1ccfdeab31127f40537e7484 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:31 -0400 Subject: selftests/mm: reuse pagemap_get_entry() in vm_util.h Meanwhile drop pagemap_read_vaddr(). Link: https://lkml.kernel.org/r/20230412164231.328157-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/userfaultfd.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index 7e841f7e2884..795fbc4d84f8 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -1399,19 +1399,6 @@ static int pagemap_open(void) return fd; } -static uint64_t pagemap_read_vaddr(int fd, void *vaddr) -{ - uint64_t value; - int ret; - - ret = pread(fd, &value, sizeof(uint64_t), - ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); - if (ret != sizeof(uint64_t)) - err("pread() on pagemap failed"); - - return value; -} - /* This macro let __LINE__ works in err() */ #define pagemap_check_wp(value, wp) do { \ if (!!(value & PM_UFFD_WP) != wp) \ @@ -1427,7 +1414,7 @@ static int pagemap_test_fork(bool present) if (!child) { /* Open the pagemap fd of the child itself */ fd = pagemap_open(); - value = pagemap_read_vaddr(fd, area_dst); + value = pagemap_get_entry(fd, area_dst); /* * After fork() uffd-wp bit should be gone as long as we're * without UFFD_FEATURE_EVENT_FORK @@ -1446,24 +1433,24 @@ static void userfaultfd_wp_unpopulated_test(int pagemap_fd) /* Test applying pte marker to anon unpopulated */ wp_range(uffd, (uint64_t)area_dst, page_size, true); - value = pagemap_read_vaddr(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, true); /* Test unprotect on anon pte marker */ wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_read_vaddr(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, false); /* Test zap on anon marker */ wp_range(uffd, (uint64_t)area_dst, page_size, true); if (madvise(area_dst, page_size, MADV_DONTNEED)) err("madvise(MADV_DONTNEED) failed"); - value = pagemap_read_vaddr(pagemap_fd, area_dst); + value = 
pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, false); /* Test fault in after marker removed */ *area_dst = 1; - value = pagemap_read_vaddr(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, false); /* Drop it to make pte none again */ if (madvise(area_dst, page_size, MADV_DONTNEED)) @@ -1522,7 +1509,7 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) /* Touch the page */ *area_dst = 1; wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); - value = pagemap_read_vaddr(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, true); /* Make sure uffd-wp bit dropped when fork */ if (pagemap_test_fork(true)) @@ -1536,7 +1523,7 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) err("madvise(MADV_PAGEOUT) failed"); /* Uffd-wp should persist even swapped out */ - value = pagemap_read_vaddr(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, true); /* Make sure uffd-wp bit dropped when fork */ if (pagemap_test_fork(false)) @@ -1544,12 +1531,12 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) /* Unprotect; this tests swap pte modifications */ wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_read_vaddr(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, false); /* Fault in the page from disk */ *area_dst = 2; - value = pagemap_read_vaddr(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, false); close(pagemap_fd); -- cgit v1.2.3 From 4af9ff29816ac011af143c9d5cd16ab75429a600 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:34 -0400 Subject: selftests/mm: test UFFDIO_ZEROPAGE only when !hugetlb Make the check as simple as "test_type == TEST_HUGETLB" because that's the only mem that doesn't support ZEROPAGE. Link: https://lkml.kernel.org/r/20230412164234.328168-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index 795fbc4d84f8..d724f1c78847 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -1118,7 +1118,7 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) { struct uffdio_zeropage uffdio_zeropage; int ret; - bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); + bool has_zeropage = !(test_type == TEST_HUGETLB); __s64 res; if (offset >= nr_pages * page_size) -- cgit v1.2.3 From 618aeb5d6255a7c2db2ac9cb850e5e044dd916c5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:38 -0400 Subject: selftests/mm: drop test_uffdio_zeropage_eexist The idea was trying to flip this var in the alarm handler from time to time to test -EEXIST of UFFDIO_ZEROPAGE, but firstly it's only used in the zeropage test so probably only used once, meanwhile we passed "retry==false" so it'll never got tested anyway. Drop both sides so we always test UFFDIO_ZEROPAGE retries if has_zeropage is set (!hugetlb). 
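For reference, the -EEXIST behaviour the retry path keeps exercising looks roughly like this (illustrative sketch only, not code from this patch; it assumes 'dst' is already registered in missing mode on 'ufd', and error handling is simplified):

===8<===
/* Illustrative only: a second UFFDIO_ZEROPAGE on an already-filled range
 * is expected to fail, with -EEXIST reported through the .zeropage field. */
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>
#include <errno.h>
#include <err.h>

static void zeropage_then_expect_eexist(int ufd, unsigned long dst,
					unsigned long len)
{
	struct uffdio_zeropage zp = {
		.range = { .start = dst, .len = len },
		.mode = 0,
	};

	if (ioctl(ufd, UFFDIO_ZEROPAGE, &zp))
		errx(1, "UFFDIO_ZEROPAGE failed (zeropage=%lld)",
		     (long long)zp.zeropage);
	if (!ioctl(ufd, UFFDIO_ZEROPAGE, &zp) || zp.zeropage != -EEXIST)
		errx(1, "retried UFFDIO_ZEROPAGE did not report -EEXIST");
}
===8<===

That mirrors what retry_uffdio_zeropage() keeps checking now that the one-shot test_uffdio_zeropage_eexist switch is gone.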
One more thing to do is doing UFFDIO_REGISTER for the alias buffer too, because otherwise the test won't even pass! We were just lucky that this test never really got ran at all. Link: https://lkml.kernel.org/r/20230412164238.328238-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Axel Rasmussen Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/userfaultfd.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index d724f1c78847..3487ec0bfcc8 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -88,7 +88,6 @@ static bool test_dev_userfaultfd; /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ #define ALARM_INTERVAL_SECS 10 static volatile bool test_uffdio_copy_eexist = true; -static volatile bool test_uffdio_zeropage_eexist = true; /* Whether to test uffd write-protection */ static bool test_uffdio_wp = true; /* Whether to test uffd minor faults */ @@ -1114,7 +1113,7 @@ static void retry_uffdio_zeropage(int ufd, } } -static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) +static int __uffdio_zeropage(int ufd, unsigned long offset) { struct uffdio_zeropage uffdio_zeropage; int ret; @@ -1138,11 +1137,8 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) if (res != page_size) { err("UFFDIO_ZEROPAGE unexpected size"); } else { - if (test_uffdio_zeropage_eexist && retry) { - test_uffdio_zeropage_eexist = false; - retry_uffdio_zeropage(ufd, &uffdio_zeropage, - offset); - } + retry_uffdio_zeropage(ufd, &uffdio_zeropage, + offset); return 1; } } else @@ -1153,7 +1149,7 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) static int uffdio_zeropage(int ufd, unsigned long offset) { - return __uffdio_zeropage(ufd, offset, false); + return __uffdio_zeropage(ufd, offset); } /* exercise UFFDIO_ZEROPAGE */ @@ -1177,6 +1173,13 @@ static int userfaultfd_zeropage_test(void) assert_expected_ioctls_present( uffdio_register.mode, uffdio_register.ioctls); + if (area_dst_alias) { + /* Needed this to test zeropage-retry on shared memory */ + uffdio_register.range.start = (unsigned long) area_dst_alias; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + } + if (uffdio_zeropage(uffd, 0)) if (my_bcmp(area_dst, zeropage, page_size)) err("zeropage is not zero"); @@ -1763,7 +1766,6 @@ static void sigalrm(int sig) if (sig != SIGALRM) abort(); test_uffdio_copy_eexist = true; - test_uffdio_zeropage_eexist = true; alarm(ALARM_INTERVAL_SECS); } -- cgit v1.2.3 From 33be4e892877eec7c8eb32988b95f5c42c87f0f0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:41 -0400 Subject: selftests/mm: create uffd-common.[ch] Move common utility functions into uffd-common.[ch] files from the original userfaultfd.c. This prepares for a split of userfaultfd.c into two tests: one to only cover the old but powerful stress test, the other one covers all the functional tests. This movement is kind of a brute-force effort for now, with light touch-ups but nothing should really change. There's chances to optimize more, but let's leave that for later. 
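For orientation, a consumer of the new shared code would look roughly like the sketch below (hypothetical: it assumes uffd-common.h exports the globals and helpers it now owns, and TEST_ANON follows the existing test-type constants; the real stress/unit-test callers only arrive later in the series):

===8<===
/* Hypothetical test built on top of uffd-common.[ch]; not part of this patch. */
#include <unistd.h>
#include "uffd-common.h"

static void anon_smoke_setup(void)
{
	/* These globals now live in uffd-common.c. */
	page_size = getpagesize();
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	nr_pages_per_cpu = 32;
	nr_pages = nr_cpus * nr_pages_per_cpu;

	test_type = TEST_ANON;			/* assumed test-type constant */
	uffd_test_ops = &anon_uffd_test_ops;

	/* mmap()s area_src/area_dst, opens the uffd, sets up pipes/counters. */
	uffd_test_ctx_init(0);
}
===8<===

The exact entry points are introduced by later patches; the point here is only that all of the shared state and helpers now come from one place.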
Link: https://lkml.kernel.org/r/20230412164241.328259-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 + tools/testing/selftests/mm/uffd-common.c | 611 +++++++++++++++++++++++++++ tools/testing/selftests/mm/uffd-common.h | 117 ++++++ tools/testing/selftests/mm/userfaultfd.c | 694 +------------------------------ 4 files changed, 731 insertions(+), 693 deletions(-) create mode 100644 tools/testing/selftests/mm/uffd-common.c create mode 100644 tools/testing/selftests/mm/uffd-common.h (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index b2a8cac79899..210da78ec495 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -108,6 +108,8 @@ include ../lib.mk $(TEST_GEN_PROGS): vm_util.c +$(OUTPUT)/userfaultfd: uffd-common.c + ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c new file mode 100644 index 000000000000..c57757c2a36f --- /dev/null +++ b/tools/testing/selftests/mm/uffd-common.c @@ -0,0 +1,611 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Userfaultfd tests util functions + * + * Copyright (C) 2015-2023 Red Hat, Inc. + */ + +#include "uffd-common.h" + +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + +volatile bool test_uffdio_copy_eexist = true; +unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; +char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +int mem_fd, uffd = -1, uffd_flags, finished, *pipefd, test_type; +bool map_shared, test_collapse, test_dev_userfaultfd; +bool test_uffdio_wp = true, test_uffdio_minor = false; +unsigned long long *count_verify; +uffd_test_ops_t *uffd_test_ops; + +static void anon_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); +} + +static void anon_allocate_area(void **alloc_area, bool is_src) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +} + +static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ +} + +static void hugetlb_release_pages(char *rel_area) +{ + if (!map_shared) { + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + } else { + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); + } +} + +static void hugetlb_allocate_area(void **alloc_area, bool is_src) +{ + off_t size = nr_pages * page_size; + off_t offset = is_src ? 0 : size; + void *area_alias = NULL; + char **alloc_area_alias; + + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + (is_src ? 
0 : MAP_NORESERVE), + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of hugetlbfs file failed"); + + if (map_shared) { + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of hugetlb file alias failed"); + } + + if (is_src) { + alloc_area_alias = &area_src_alias; + } else { + alloc_area_alias = &area_dst_alias; + } + if (area_alias) + *alloc_area_alias = area_alias; +} + +static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + if (!map_shared) + return; + + *start = (unsigned long) area_dst_alias + offset; +} + +static void shmem_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); +} + +static void shmem_allocate_area(void **alloc_area, bool is_src) +{ + void *area_alias = NULL; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } + + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); + + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); + + if (is_src) + area_src_alias = area_alias; + else + area_dst_alias = area_alias; +} + +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + *start = (unsigned long)area_dst_alias + offset; +} + +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + +struct uffd_test_ops anon_uffd_test_ops = { + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, + .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, +}; + +struct uffd_test_ops shmem_uffd_test_ops = { + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, + .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, +}; + +struct uffd_test_ops hugetlb_uffd_test_ops = { + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, + .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, +}; + +void uffd_stats_report(struct uffd_stats *stats, int n_cpus) +{ + int i; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; + + for (i = 0; i < n_cpus; i++) { + miss_total += stats[i].missing_faults; + wp_total += stats[i].wp_faults; + minor_total += stats[i].minor_faults; + } + + printf("userfaults: "); + if (miss_total) { + printf("%llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].missing_faults); + printf("\b) "); + } + if (wp_total) { + printf("%llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].wp_faults); + printf("\b) "); + } + if (minor_total) { + printf("%llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", 
stats[i].minor_faults); + printf("\b)"); + } + printf("\n"); +} + +static int __userfaultfd_open_dev(void) +{ + int fd, _uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); + + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); + if (_uffd < 0) + errexit(errno == ENOTTY ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + close(fd); + return _uffd; +} + +void userfaultfd_open(uint64_t *features) +{ + struct uffdio_api uffdio_api; + + if (test_dev_userfaultfd) + uffd = __userfaultfd_open_dev(); + else { + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); + if (uffd < 0) + errexit(errno == ENOSYS ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + } + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = *features; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + err("UFFDIO_API failed.\nPlease make sure to " + "run with either root or ptrace capability."); + if (uffdio_api.api != UFFD_API) + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); + + *features = uffdio_api.features; +} + +static inline void munmap_area(void **area) +{ + if (*area) + if (munmap(*area, nr_pages * page_size)) + err("munmap"); + + *area = NULL; +} + +static void uffd_test_ctx_clear(void) +{ + size_t i; + + if (pipefd) { + for (i = 0; i < nr_cpus * 2; ++i) { + if (close(pipefd[i])) + err("close pipefd"); + } + free(pipefd); + pipefd = NULL; + } + + if (count_verify) { + free(count_verify); + count_verify = NULL; + } + + if (uffd != -1) { + if (close(uffd)) + err("close uffd"); + uffd = -1; + } + + munmap_area((void **)&area_src); + munmap_area((void **)&area_src_alias); + munmap_area((void **)&area_dst); + munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); +} + +void uffd_test_ctx_init(uint64_t features) +{ + unsigned long nr, cpu; + + uffd_test_ctx_clear(); + + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); + + userfaultfd_open(&features); + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) + err("count_verify"); + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. + */ + *(area_count(area_src, nr) + 1) = 1; + } + + /* + * After initialization of area_src, we must explicitly release pages + * for area_dst to make sure it's fully empty. Otherwise we could have + * some area_dst pages be errornously initialized with zero pages, + * hence we could hit memory corruption later in the test. + * + * One example is when THP is globally enabled, above allocate_area() + * calls could have the two areas merged into a single VMA (as they + * will have the same VMA flags so they're mergeable). When we + * initialize the area_src above, it's possible that some part of + * area_dst could have been faulted in via one huge THP that will be + * shared between area_src and area_dst. It could cause some of the + * area_dst won't be trapped by missing userfaults. + * + * This release_pages() will guarantee even if that happened, we'll + * proactively split the thp and drop any accidentally initialized + * pages within area_dst. 
+ */ + uffd_test_ops->release_pages(area_dst); + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) + err("pipefd"); + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) + err("pipe"); +} + +uint64_t get_expected_ioctls(uint64_t mode) +{ + uint64_t ioctls = UFFD_API_RANGE_IOCTLS; + + if (test_type == TEST_HUGETLB) + ioctls &= ~(1 << _UFFDIO_ZEROPAGE); + + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); + + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) + ioctls &= ~(1 << _UFFDIO_CONTINUE); + + return ioctls; +} + +void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) +{ + uint64_t expected = get_expected_ioctls(mode); + uint64_t actual = ioctls & expected; + + if (actual != expected) { + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, + expected, actual); + } +} + +void wp_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_writeprotect prms; + + /* Write protection page faults */ + prms.range.start = start; + prms.range.len = len; + /* Undo write-protect, do wakeup after that */ + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; + + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); +} + +static void continue_range(int ufd, __u64 start, __u64 len) +{ + struct uffdio_continue req; + int ret; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + if (test_uffdio_wp) + req.mode |= UFFDIO_CONTINUE_MODE_WP; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, + (uint64_t)start); + + /* + * Error handling within the kernel for continue is subtly different + * from copy or zeropage, so it may be a source of bugs. Trigger an + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. + */ + req.mapped = 0; + ret = ioctl(ufd, UFFDIO_CONTINUE, &req); + if (ret >= 0 || req.mapped != -EEXIST) + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, + ret, (int64_t) req.mapped); +} + +int uffd_read_msg(int ufd, struct uffd_msg *msg) +{ + int ret = read(uffd, msg, sizeof(*msg)); + + if (ret != sizeof(*msg)) { + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + return 1; + err("blocking read error"); + } else { + err("short read"); + } + } + + return 0; +} + +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ + wp_range(uffd, msg->arg.pagefault.address, page_size, false); + stats->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size); + stats->minor_faults++; + } else { + /* + * Missing page faults. + * + * Here we force a write check for each of the missing mode + * faults. 
It's guaranteed because the only threads that + * will trigger uffd faults are the locking threads, and + * their first instruction to touch the missing page will + * always be pthread_mutex_lock(). + * + * Note that here we relied on an NPTL glibc impl detail to + * always read the lock type at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before + * doing any locking operations to guarantee that. It's + * actually not good to rely on this impl detail because + * logically a pthread-compatible lib can implement the + * locks without types and we can fail when linking with + * them. However since we used to find bugs with this + * strict check we still keep it around. Hopefully this + * could be a good hint when it fails again. If one day + * it'll break on some other impl of glibc we'll revisit. + */ + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + err("unexpected write fault"); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + if (copy_page(uffd, offset)) + stats->missing_faults++; + } +} + +void *uffd_poll_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + unsigned long cpu = stats->cpu; + struct pollfd pollfd[2]; + struct uffd_msg msg; + struct uffdio_register uffd_reg; + int ret; + char tmp_chr; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (ret <= 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + err("poll error: %d", ret); + } + if (pollfd[1].revents) { + if (!(pollfd[1].revents & POLLIN)) + err("pollfd[1].revents %d", pollfd[1].revents); + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + err("read pipefd error"); + break; + } + if (!(pollfd[0].revents & POLLIN)) + err("pollfd[0].revents %d", pollfd[0].revents); + if (uffd_read_msg(uffd, &msg)) + continue; + switch (msg.event) { + default: + err("unexpected msg event %u\n", msg.event); + break; + case UFFD_EVENT_PAGEFAULT: + uffd_handle_page_fault(&msg, stats); + break; + case UFFD_EVENT_FORK: + close(uffd); + uffd = msg.arg.fork.ufd; + pollfd[0].fd = uffd; + break; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + err("remove failure"); + break; + case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ + area_dst = (char *)(unsigned long)msg.arg.remap.to; + break; + } + } + + return NULL; +} + +static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_copy->dst, + uffdio_copy->len, + offset); + if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy->copy != -EEXIST) + err("UFFDIO_COPY retry error: %"PRId64, + (int64_t)uffdio_copy->copy); + } else { + err("UFFDIO_COPY retry unexpected: %"PRId64, + (int64_t)uffdio_copy->copy); + } +} + +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); +} + +int __copy_page(int ufd, unsigned long offset, bool retry) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu\n", offset); + uffdio_copy.dst = 
(unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + if (test_uffdio_wp) + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; + else + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + err("UFFDIO_COPY error: %"PRId64, + (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); + } else if (uffdio_copy.copy != page_size) { + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); + } else { + if (test_uffdio_copy_eexist && retry) { + test_uffdio_copy_eexist = false; + retry_copy_page(ufd, &uffdio_copy, offset); + } + return 1; + } + return 0; +} + +int copy_page(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, false); +} diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h new file mode 100644 index 000000000000..d9430cfdcb19 --- /dev/null +++ b/tools/testing/selftests/mm/uffd-common.h @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Userfaultfd tests common header + * + * Copyright (C) 2015-2023 Red Hat, Inc. + */ +#ifndef __UFFD_COMMON_H__ +#define __UFFD_COMMON_H__ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +#define _err(fmt, ...) \ + do { \ + int ret = errno; \ + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ + fprintf(stderr, " (errno=%d, @%s:%d)\n", \ + ret, __FILE__, __LINE__); \ + } while (0) + +#define errexit(exitcode, fmt, ...) \ + do { \ + _err(fmt, ##__VA_ARGS__); \ + exit(exitcode); \ + } while (0) + +#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid non alignment faults on non-x86 archs. 
+ */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +/* Userfaultfd test statistics */ +struct uffd_stats { + int cpu; + unsigned long missing_faults; + unsigned long wp_faults; + unsigned long minor_faults; +}; + +struct uffd_test_ops { + void (*allocate_area)(void **alloc_area, bool is_src); + void (*release_pages)(char *rel_area); + void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); +}; +typedef struct uffd_test_ops uffd_test_ops_t; + +extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; +extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +extern int mem_fd, uffd, uffd_flags, finished, *pipefd, test_type; +extern bool map_shared, test_collapse, test_dev_userfaultfd; +extern bool test_uffdio_wp, test_uffdio_minor; +extern unsigned long long *count_verify; +extern volatile bool test_uffdio_copy_eexist; + +extern uffd_test_ops_t anon_uffd_test_ops; +extern uffd_test_ops_t shmem_uffd_test_ops; +extern uffd_test_ops_t hugetlb_uffd_test_ops; +extern uffd_test_ops_t *uffd_test_ops; + +void uffd_stats_report(struct uffd_stats *stats, int n_cpus); +void uffd_test_ctx_init(uint64_t features); +void userfaultfd_open(uint64_t *features); +uint64_t get_expected_ioctls(uint64_t mode); +void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls); +int uffd_read_msg(int ufd, struct uffd_msg *msg); +void wp_range(int ufd, __u64 start, __u64 len, bool wp); +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats); +int __copy_page(int ufd, unsigned long offset, bool retry); +int copy_page(int ufd, unsigned long offset); +void *uffd_poll_thread(void *arg); + +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 + +#endif diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index 3487ec0bfcc8..c68a9aeefc41 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -34,96 +34,20 @@ * transfer (UFFDIO_COPY). 
*/ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" -#include "vm_util.h" +#include "uffd-common.h" #ifdef __NR_userfaultfd -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; - #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) #define BOUNCE_VERIFY (1<<2) #define BOUNCE_POLL (1<<3) static int bounces; -#define TEST_ANON 1 -#define TEST_HUGETLB 2 -#define TEST_SHMEM 3 -static int test_type; - -#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) - -#define BASE_PMD_ADDR ((void *)(1UL << 30)) - -/* test using /dev/userfaultfd, instead of userfaultfd(2) */ -static bool test_dev_userfaultfd; - /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ #define ALARM_INTERVAL_SECS 10 -static volatile bool test_uffdio_copy_eexist = true; -/* Whether to test uffd write-protection */ -static bool test_uffdio_wp = true; -/* Whether to test uffd minor faults */ -static bool test_uffdio_minor = false; -static bool map_shared; -static int mem_fd; -static unsigned long long *count_verify; -static int uffd = -1; -static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; static char *zeropage; pthread_attr_t attr; -static bool test_collapse; - -/* Userfaultfd test statistics */ -struct uffd_stats { - int cpu; - unsigned long missing_faults; - unsigned long wp_faults; - unsigned long minor_faults; -}; - -/* pthread_mutex_t starts at page offset 0 */ -#define area_mutex(___area, ___nr) \ - ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) -/* - * count is placed in the page after pthread_mutex_t naturally aligned - * to avoid non alignment faults on non-x86 archs. - */ -#define area_count(___area, ___nr) \ - ((volatile unsigned long long *) ((unsigned long) \ - ((___area) + (___nr)*page_size + \ - sizeof(pthread_mutex_t) + \ - sizeof(unsigned long long) - 1) & \ - ~(unsigned long)(sizeof(unsigned long long) \ - - 1))) #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) @@ -166,22 +90,6 @@ static void usage(void) exit(1); } -#define _err(fmt, ...) \ - do { \ - int ret = errno; \ - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ - fprintf(stderr, " (errno=%d, line=%d)\n", \ - ret, __LINE__); \ - } while (0) - -#define errexit(exitcode, fmt, ...) \ - do { \ - _err(fmt, ##__VA_ARGS__); \ - exit(exitcode); \ - } while (0) - -#define err(fmt, ...) 
errexit(1, fmt, ##__VA_ARGS__) - static void uffd_stats_reset(struct uffd_stats *uffd_stats, unsigned long n_cpus) { @@ -195,189 +103,6 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats, } } -static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) -{ - int i; - unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; - - for (i = 0; i < n_cpus; i++) { - miss_total += stats[i].missing_faults; - wp_total += stats[i].wp_faults; - minor_total += stats[i].minor_faults; - } - - printf("userfaults: "); - if (miss_total) { - printf("%llu missing (", miss_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].missing_faults); - printf("\b) "); - } - if (wp_total) { - printf("%llu wp (", wp_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].wp_faults); - printf("\b) "); - } - if (minor_total) { - printf("%llu minor (", minor_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].minor_faults); - printf("\b)"); - } - printf("\n"); -} - -static void anon_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); -} - -static void anon_allocate_area(void **alloc_area, bool is_src) -{ - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); -} - -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ -} - -static void hugetlb_release_pages(char *rel_area) -{ - if (!map_shared) { - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - } else { - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); - } -} - -static void hugetlb_allocate_area(void **alloc_area, bool is_src) -{ - off_t size = nr_pages * page_size; - off_t offset = is_src ? 0 : size; - void *area_alias = NULL; - char **alloc_area_alias; - - *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | - (is_src ? 0 : MAP_NORESERVE), - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of hugetlbfs file failed"); - - if (map_shared) { - area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED, mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of hugetlb file alias failed"); - } - - if (is_src) { - alloc_area_alias = &area_src_alias; - } else { - alloc_area_alias = &area_dst_alias; - } - if (area_alias) - *alloc_area_alias = area_alias; -} - -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - if (!map_shared) - return; - - *start = (unsigned long) area_dst_alias + offset; -} - -static void shmem_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); -} - -static void shmem_allocate_area(void **alloc_area, bool is_src) -{ - void *area_alias = NULL; - size_t bytes = nr_pages * page_size; - unsigned long offset = is_src ? 
0 : bytes; - char *p = NULL, *p_alias = NULL; - - if (test_collapse) { - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); - p_alias = p; - p_alias += bytes; - p_alias += hpage_size; /* Prevent src/dst VMA merge */ - } - - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of memfd failed"); - if (test_collapse && *alloc_area != p) - err("mmap of memfd failed at %p", p); - - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of memfd alias failed"); - if (test_collapse && area_alias != p_alias) - err("mmap of anonymous memory failed at %p", p_alias); - - if (is_src) - area_src_alias = area_alias; - else - area_dst_alias = area_alias; -} - -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - *start = (unsigned long)area_dst_alias + offset; -} - -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) -{ - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) - err("Did not find expected %d number of hugepages", - expect_nr_hpages); -} - -struct uffd_test_ops { - void (*allocate_area)(void **alloc_area, bool is_src); - void (*release_pages)(char *rel_area); - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); - void (*check_pmd_mapping)(void *p, int expect_nr_hpages); -}; - -static struct uffd_test_ops anon_uffd_test_ops = { - .allocate_area = anon_allocate_area, - .release_pages = anon_release_pages, - .alias_mapping = noop_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops shmem_uffd_test_ops = { - .allocate_area = shmem_allocate_area, - .release_pages = shmem_release_pages, - .alias_mapping = shmem_alias_mapping, - .check_pmd_mapping = shmem_check_pmd_mapping, -}; - -static struct uffd_test_ops hugetlb_uffd_test_ops = { - .allocate_area = hugetlb_allocate_area, - .release_pages = hugetlb_release_pages, - .alias_mapping = hugetlb_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops *uffd_test_ops; - static inline uint64_t uffd_minor_feature(void) { if (test_type == TEST_HUGETLB && map_shared) @@ -388,171 +113,6 @@ static inline uint64_t uffd_minor_feature(void) return 0; } -static uint64_t get_expected_ioctls(uint64_t mode) -{ - uint64_t ioctls = UFFD_API_RANGE_IOCTLS; - - if (test_type == TEST_HUGETLB) - ioctls &= ~(1 << _UFFDIO_ZEROPAGE); - - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); - - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) - ioctls &= ~(1 << _UFFDIO_CONTINUE); - - return ioctls; -} - -static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) -{ - uint64_t expected = get_expected_ioctls(mode); - uint64_t actual = ioctls & expected; - - if (actual != expected) { - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, - expected, actual); - } -} - -static int __userfaultfd_open_dev(void) -{ - int fd, _uffd; - - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); - if (fd < 0) - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); - - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); - if (_uffd < 0) - errexit(errno == ENOTTY ? 
KSFT_SKIP : 1, - "creating userfaultfd failed"); - close(fd); - return _uffd; -} - -static void userfaultfd_open(uint64_t *features) -{ - struct uffdio_api uffdio_api; - - if (test_dev_userfaultfd) - uffd = __userfaultfd_open_dev(); - else { - uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); - if (uffd < 0) - errexit(errno == ENOSYS ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - } - uffd_flags = fcntl(uffd, F_GETFD, NULL); - - uffdio_api.api = UFFD_API; - uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - err("UFFDIO_API failed.\nPlease make sure to " - "run with either root or ptrace capability."); - if (uffdio_api.api != UFFD_API) - err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); - - *features = uffdio_api.features; -} - -static inline void munmap_area(void **area) -{ - if (*area) - if (munmap(*area, nr_pages * page_size)) - err("munmap"); - - *area = NULL; -} - -static void uffd_test_ctx_clear(void) -{ - size_t i; - - if (pipefd) { - for (i = 0; i < nr_cpus * 2; ++i) { - if (close(pipefd[i])) - err("close pipefd"); - } - free(pipefd); - pipefd = NULL; - } - - if (count_verify) { - free(count_verify); - count_verify = NULL; - } - - if (uffd != -1) { - if (close(uffd)) - err("close uffd"); - uffd = -1; - } - - munmap_area((void **)&area_src); - munmap_area((void **)&area_src_alias); - munmap_area((void **)&area_dst); - munmap_area((void **)&area_dst_alias); - munmap_area((void **)&area_remap); -} - -static void uffd_test_ctx_init(uint64_t features) -{ - unsigned long nr, cpu; - - uffd_test_ctx_clear(); - - uffd_test_ops->allocate_area((void **)&area_src, true); - uffd_test_ops->allocate_area((void **)&area_dst, false); - - userfaultfd_open(&features); - - count_verify = malloc(nr_pages * sizeof(unsigned long long)); - if (!count_verify) - err("count_verify"); - - for (nr = 0; nr < nr_pages; nr++) { - *area_mutex(area_src, nr) = - (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; - count_verify[nr] = *area_count(area_src, nr) = 1; - /* - * In the transition between 255 to 256, powerpc will - * read out of order in my_bcmp and see both bytes as - * zero, so leave a placeholder below always non-zero - * after the count, to avoid my_bcmp to trigger false - * positives. - */ - *(area_count(area_src, nr) + 1) = 1; - } - - /* - * After initialization of area_src, we must explicitly release pages - * for area_dst to make sure it's fully empty. Otherwise we could have - * some area_dst pages be errornously initialized with zero pages, - * hence we could hit memory corruption later in the test. - * - * One example is when THP is globally enabled, above allocate_area() - * calls could have the two areas merged into a single VMA (as they - * will have the same VMA flags so they're mergeable). When we - * initialize the area_src above, it's possible that some part of - * area_dst could have been faulted in via one huge THP that will be - * shared between area_src and area_dst. It could cause some of the - * area_dst won't be trapped by missing userfaults. - * - * This release_pages() will guarantee even if that happened, we'll - * proactively split the thp and drop any accidentally initialized - * pages within area_dst. 
- */ - uffd_test_ops->release_pages(area_dst); - - pipefd = malloc(sizeof(int) * nr_cpus * 2); - if (!pipefd) - err("pipefd"); - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) - err("pipe"); -} - static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; @@ -562,47 +122,6 @@ static int my_bcmp(char *str1, char *str2, size_t n) return 0; } -static void wp_range(int ufd, __u64 start, __u64 len, bool wp) -{ - struct uffdio_writeprotect prms; - - /* Write protection page faults */ - prms.range.start = start; - prms.range.len = len; - /* Undo write-protect, do wakeup after that */ - prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; - - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) - err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); -} - -static void continue_range(int ufd, __u64 start, __u64 len) -{ - struct uffdio_continue req; - int ret; - - req.range.start = start; - req.range.len = len; - req.mode = 0; - if (test_uffdio_wp) - req.mode |= UFFDIO_CONTINUE_MODE_WP; - - if (ioctl(ufd, UFFDIO_CONTINUE, &req)) - err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, - (uint64_t)start); - - /* - * Error handling within the kernel for continue is subtly different - * from copy or zeropage, so it may be a source of bugs. Trigger an - * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. - */ - req.mapped = 0; - ret = ioctl(ufd, UFFDIO_CONTINUE, &req); - if (ret >= 0 || req.mapped != -EEXIST) - err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, - ret, (int64_t) req.mapped); -} - static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -635,222 +154,11 @@ static void *locking_thread(void *arg) return NULL; } -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_copy->dst, - uffdio_copy->len, - offset); - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy->copy != -EEXIST) - err("UFFDIO_COPY retry error: %"PRId64, - (int64_t)uffdio_copy->copy); - } else { - err("UFFDIO_COPY retry unexpected: %"PRId64, - (int64_t)uffdio_copy->copy); - } -} - -static void wake_range(int ufd, unsigned long addr, unsigned long len) -{ - struct uffdio_range uffdio_wake; - - uffdio_wake.start = addr; - uffdio_wake.len = len; - - if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) - fprintf(stderr, "error waking %lu\n", - addr), exit(1); -} - -static int __copy_page(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_copy uffdio_copy; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu\n", offset); - uffdio_copy.dst = (unsigned long) area_dst + offset; - uffdio_copy.src = (unsigned long) area_src + offset; - uffdio_copy.len = page_size; - if (test_uffdio_wp) - uffdio_copy.mode = UFFDIO_COPY_MODE_WP; - else - uffdio_copy.mode = 0; - uffdio_copy.copy = 0; - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy.copy != -EEXIST) - err("UFFDIO_COPY error: %"PRId64, - (int64_t)uffdio_copy.copy); - wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { - err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); - } else { - if (test_uffdio_copy_eexist && retry) { - test_uffdio_copy_eexist = false; - retry_copy_page(ufd, &uffdio_copy, offset); - } - return 1; - } - return 0; -} - static int copy_page_retry(int ufd, unsigned long offset) { return __copy_page(ufd, 
offset, true); } -static int copy_page(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, false); -} - -static int uffd_read_msg(int ufd, struct uffd_msg *msg) -{ - int ret = read(uffd, msg, sizeof(*msg)); - - if (ret != sizeof(*msg)) { - if (ret < 0) { - if (errno == EAGAIN || errno == EINTR) - return 1; - err("blocking read error"); - } else { - err("short read"); - } - } - - return 0; -} - -static void uffd_handle_page_fault(struct uffd_msg *msg, - struct uffd_stats *stats) -{ - unsigned long offset; - - if (msg->event != UFFD_EVENT_PAGEFAULT) - err("unexpected msg event %u", msg->event); - - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { - /* Write protect page faults */ - wp_range(uffd, msg->arg.pagefault.address, page_size, false); - stats->wp_faults++; - } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { - uint8_t *area; - int b; - - /* - * Minor page faults - * - * To prove we can modify the original range for testing - * purposes, we're going to bit flip this range before - * continuing. - * - * Note that this requires all minor page fault tests operate on - * area_dst (non-UFFD-registered) and area_dst_alias - * (UFFD-registered). - */ - - area = (uint8_t *)(area_dst + - ((char *)msg->arg.pagefault.address - - area_dst_alias)); - for (b = 0; b < page_size; ++b) - area[b] = ~area[b]; - continue_range(uffd, msg->arg.pagefault.address, page_size); - stats->minor_faults++; - } else { - /* - * Missing page faults. - * - * Here we force a write check for each of the missing mode - * faults. It's guaranteed because the only threads that - * will trigger uffd faults are the locking threads, and - * their first instruction to touch the missing page will - * always be pthread_mutex_lock(). - * - * Note that here we relied on an NPTL glibc impl detail to - * always read the lock type at the entry of the lock op - * (pthread_mutex_t.__data.__type, offset 0x10) before - * doing any locking operations to guarantee that. It's - * actually not good to rely on this impl detail because - * logically a pthread-compatible lib can implement the - * locks without types and we can fail when linking with - * them. However since we used to find bugs with this - * strict check we still keep it around. Hopefully this - * could be a good hint when it fails again. If one day - * it'll break on some other impl of glibc we'll revisit. 
- */ - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - err("unexpected write fault"); - - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); - - if (copy_page(uffd, offset)) - stats->missing_faults++; - } -} - -static void *uffd_poll_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - unsigned long cpu = stats->cpu; - struct pollfd pollfd[2]; - struct uffd_msg msg; - struct uffdio_register uffd_reg; - int ret; - char tmp_chr; - - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd[cpu*2]; - pollfd[1].events = POLLIN; - - for (;;) { - ret = poll(pollfd, 2, -1); - if (ret <= 0) { - if (errno == EINTR || errno == EAGAIN) - continue; - err("poll error: %d", ret); - } - if (pollfd[1].revents & POLLIN) { - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) - err("read pipefd error"); - break; - } - if (!(pollfd[0].revents & POLLIN)) - err("pollfd[0].revents %d", pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) - continue; - switch (msg.event) { - default: - err("unexpected msg event %u\n", msg.event); - break; - case UFFD_EVENT_PAGEFAULT: - uffd_handle_page_fault(&msg, stats); - break; - case UFFD_EVENT_FORK: - close(uffd); - uffd = msg.arg.fork.ufd; - pollfd[0].fd = uffd; - break; - case UFFD_EVENT_REMOVE: - uffd_reg.range.start = msg.arg.remove.start; - uffd_reg.range.len = msg.arg.remove.end - - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - err("remove failure"); - break; - case UFFD_EVENT_REMAP: - area_remap = area_dst; /* save for later unmap */ - area_dst = (char *)(unsigned long)msg.arg.remap.to; - break; - } - } - - return NULL; -} - pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; static void *uffd_read_thread(void *arg) -- cgit v1.2.3 From 686a8bb723497197022c73807cdd29eb5a1aeec8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:44 -0400 Subject: selftests/mm: split uffd tests into uffd-stress and uffd-unit-tests Keeping all the tests in the same userfaultfd.c is awkward and undesirable, at least in its current form. For example, it makes little sense to rerun the stress test once for each method of creating a userfaultfd handle (the syscall or the /dev node): the stress paths are identical and only the open path differs, so running the whole stress pass twice is a waste of time. It is also awkward to have to manually specify each memory type just to run all the unit tests for the userfaultfd interface. We should be able to run a single program that goes through all the functional uffd tests without running the stress test at all; the stress test is mainly for torturing the code and finding race conditions, and we don't want to wait for it to finish just to regression-test a functional change. As more things pile up in the same file and the same functions, the code becomes chaotic and harder to maintain, with a pile of global variables. This patch creates a new test, uffd-unit-tests, to hold userfaultfd unit tests in the future; it is currently empty. Meanwhile, rename the old userfaultfd.c test to uffd-stress.c.
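
To make the intended direction concrete, here is a minimal, self-contained sketch of the kind of functional check the new uffd-unit-tests binary is meant to host: register one anonymous page for missing faults and resolve the first touch with UFFDIO_COPY, with no stress threads involved. This is purely illustrative and is not the content of the uffd-unit-tests.c added by this patch (that file starts as a small stub); the real tests would build on the uffd-common.h helpers in this series (uffd_test_ctx_init(), copy_page(), uffd_poll_thread(), ...) rather than open-coding the ABI as done here.

/*
 * Minimal functional userfaultfd check: one missing fault resolved by a
 * handler thread via UFFDIO_COPY. Uses only the public userfaultfd ABI.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef UFFD_USER_MODE_ONLY
#define UFFD_USER_MODE_ONLY 1
#endif

static long page_size;
static int uffd;
static char *area;

/* Block on the uffd, then resolve the single expected missing fault. */
static void *handler(void *arg)
{
	struct uffd_msg msg;
	struct uffdio_copy copy;
	char *src = malloc(page_size);

	if (!src)
		exit(1);
	memset(src, 0x5a, page_size);

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		exit(1);
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		exit(1);

	copy.dst = msg.arg.pagefault.address &
		   ~(unsigned long long)(page_size - 1);
	copy.src = (unsigned long)src;
	copy.len = page_size;
	copy.mode = 0;
	copy.copy = 0;
	if (ioctl(uffd, UFFDIO_COPY, &copy))	/* wakes the faulting thread */
		exit(1);
	return NULL;
}

int main(void)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	pthread_t thr;

	page_size = sysconf(_SC_PAGESIZE);

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
		return 1;

	area = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (area == MAP_FAILED)
		return 1;

	reg.range.start = (unsigned long)area;
	reg.range.len = page_size;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	if (pthread_create(&thr, NULL, handler, NULL))
		return 1;

	/* First touch triggers a missing fault; the handler fills the page. */
	if (area[0] != 0x5a)
		return 1;

	pthread_join(thr, NULL);
	printf("missing fault resolved via UFFDIO_COPY\n");
	return 0;
}

Built with something like "gcc -O2 -pthread", this exits 0 only if the first read of the registered page is satisfied by the handler's UFFDIO_COPY; UFFD_USER_MODE_ONLY (the same flag the selftests' UFFD_FLAGS uses) keeps it usable without privilege on kernels that restrict unprivileged userfaultfd.
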
Link: https://lkml.kernel.org/r/20230412164244.328270-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 3 +- tools/testing/selftests/mm/Makefile | 8 +- tools/testing/selftests/mm/run_vmtests.sh | 10 +- tools/testing/selftests/mm/uffd-stress.c | 1168 ++++++++++++++++++++++++++ tools/testing/selftests/mm/uffd-unit-tests.c | 27 + tools/testing/selftests/mm/userfaultfd.c | 1168 -------------------------- 6 files changed, 1208 insertions(+), 1176 deletions(-) create mode 100644 tools/testing/selftests/mm/uffd-stress.c create mode 100644 tools/testing/selftests/mm/uffd-unit-tests.c delete mode 100644 tools/testing/selftests/mm/userfaultfd.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 347277f2adc3..8917455f4f51 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -21,7 +21,8 @@ protection_keys protection_keys_32 protection_keys_64 madv_populate -userfaultfd +uffd-stress +uffd-unit-tests mlock-intersect-test mlock-random-test virtual_address_range diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 210da78ec495..63c03a6414fc 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -20,7 +20,7 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p # Avoid accidental wrong builds, due to built-in rules working just a little # bit too well--but not quite as well as required for our situation here. # -# In other words, "make userfaultfd" is supposed to fail to build at all, +# In other words, "make $SOME_TEST" is supposed to fail to build at all, # because this Makefile only supports either "make" (all), or "make /full/path". # However, the built-in rules, if not suppressed, will pick up CFLAGS and the # initial LDLIBS (but not the target-specific LDLIBS, because those are only @@ -57,7 +57,8 @@ TEST_GEN_PROGS += mremap_test TEST_GEN_PROGS += on-fault-limit TEST_GEN_PROGS += thuge-gen TEST_GEN_PROGS += transhuge-stress -TEST_GEN_PROGS += userfaultfd +TEST_GEN_PROGS += uffd-stress +TEST_GEN_PROGS += uffd-unit-tests TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_PROGS += ksm_tests @@ -108,7 +109,8 @@ include ../lib.mk $(TEST_GEN_PROGS): vm_util.c -$(OUTPUT)/userfaultfd: uffd-common.c +$(OUTPUT)/uffd-stress: uffd-common.c +$(OUTPUT)/uffd-unit-tests: uffd-common.c ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index ddf40f883747..efe22dc569f0 100644 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -196,14 +196,16 @@ CATEGORY="gup_test" run_test ./gup_test -a # Dump pages 0, 19, and 4096, using pin_user_pages: CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 +CATEGORY="userfaultfd" run_test ./uffd-unit-tests uffd_mods=("" ":dev") +uffd_stress_bin=./uffd-stress for mod in "${uffd_mods[@]}"; do - CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16 + CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon${mod} 20 16 # Hugetlb tests require source and destination huge pages. 
Pass in half # the size ($half_ufd_size_MB), which is used for *each*. - CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16 + CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb${mod} "$half_ufd_size_MB" 32 + CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb_shared${mod} "$half_ufd_size_MB" 32 + CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem${mod} 20 16 done #cleanup diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c new file mode 100644 index 000000000000..c68a9aeefc41 --- /dev/null +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -0,0 +1,1168 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stress userfaultfd syscall. + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This test allocates two virtual areas and bounces the physical + * memory across the two virtual areas (from area_src to area_dst) + * using userfaultfd. + * + * There are three threads running per CPU: + * + * 1) one per-CPU thread takes a per-page pthread_mutex in a random + * page of the area_dst (while the physical page may still be in + * area_src), and increments a per-page counter in the same page, + * and checks its value against a verification region. + * + * 2) another per-CPU thread handles the userfaults generated by + * thread 1 above. userfaultfd blocking reads or poll() modes are + * exercised interleaved. + * + * 3) one last per-CPU thread transfers the memory in the background + * at maximum bandwidth (if not already transferred by thread + * 2). Each cpu thread takes cares of transferring a portion of the + * area. + * + * When all threads of type 3 completed the transfer, one bounce is + * complete. area_src and area_dst are then swapped. All threads are + * respawned and so the bounce is immediately restarted in the + * opposite direction. + * + * per-CPU threads 1 by triggering userfaults inside + * pthread_mutex_lock will also verify the atomicity of the memory + * transfer (UFFDIO_COPY). 
+ */ + +#include "uffd-common.h" + +#ifdef __NR_userfaultfd + +#define BOUNCE_RANDOM (1<<0) +#define BOUNCE_RACINGFAULTS (1<<1) +#define BOUNCE_VERIFY (1<<2) +#define BOUNCE_POLL (1<<3) +static int bounces; + +/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ +#define ALARM_INTERVAL_SECS 10 +static char *zeropage; +pthread_attr_t attr; + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + +const char *examples = + "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" + "./userfaultfd anon 100 99999\n\n" + "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" + "./userfaultfd anon:dev 100 99999\n\n" + "# Run share memory test on 1GiB region with 99 bounces:\n" + "./userfaultfd shmem 1000 99\n\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" + "./userfaultfd hugetlb 256 50\n\n" + "# Run the same hugetlb test but using shared file:\n" + "./userfaultfd hugetlb_shared 256 50\n\n" + "# 10MiB-~6GiB 999 bounces anonymous test, " + "continue forever unless an error triggers\n" + "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./userfaultfd " + "[hugetlbfs_file]\n\n"); + fprintf(stderr, "Supported : anon, hugetlb, " + "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " + "Supported mods:\n"); + fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); + fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" + "memory\n"); + fprintf(stderr, "\nExample test mod usage:\n"); + fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); + fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); + + fprintf(stderr, "Examples:\n\n"); + fprintf(stderr, "%s", examples); + exit(1); +} + +static void uffd_stats_reset(struct uffd_stats *uffd_stats, + unsigned long n_cpus) +{ + int i; + + for (i = 0; i < n_cpus; i++) { + uffd_stats[i].cpu = i; + uffd_stats[i].missing_faults = 0; + uffd_stats[i].wp_faults = 0; + uffd_stats[i].minor_faults = 0; + } +} + +static inline uint64_t uffd_minor_feature(void) +{ + if (test_type == TEST_HUGETLB && map_shared) + return UFFD_FEATURE_MINOR_HUGETLBFS; + else if (test_type == TEST_SHMEM) + return UFFD_FEATURE_MINOR_SHMEM; + else + return 0; +} + +static int my_bcmp(char *str1, char *str2, size_t n) +{ + unsigned long i; + for (i = 0; i < n; i++) + if (str1[i] != str2[i]) + return 1; + return 0; +} + +static void *locking_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr; + unsigned long long count; + + if (!(bounces & BOUNCE_RANDOM)) { + page_nr = -bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + page_nr += cpu * nr_pages_per_cpu; + } + + while (!finished) { + if (bounces & BOUNCE_RANDOM) { + if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) + err("getrandom failed"); + } else + page_nr += 1; + page_nr %= nr_pages; + pthread_mutex_lock(area_mutex(area_dst, page_nr)); + count = *area_count(area_dst, page_nr); + if (count != count_verify[page_nr]) + err("page_nr %lu memory corruption %llu %llu", + page_nr, count, count_verify[page_nr]); + count++; + *area_count(area_dst, page_nr) = count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + } + + return NULL; +} + +static int 
copy_page_retry(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, true); +} + +pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *uffd_read_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + struct uffd_msg msg; + + pthread_mutex_unlock(&uffd_read_mutex); + /* from here cancellation is ok */ + + for (;;) { + if (uffd_read_msg(uffd, &msg)) + continue; + uffd_handle_page_fault(&msg, stats); + } + + return NULL; +} + +static void *background_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr, start_nr, mid_nr, end_nr; + + start_nr = cpu * nr_pages_per_cpu; + end_nr = (cpu+1) * nr_pages_per_cpu; + mid_nr = (start_nr + end_nr) / 2; + + /* Copy the first half of the pages */ + for (page_nr = start_nr; page_nr < mid_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + /* + * If we need to test uffd-wp, set it up now. Then we'll have + * at least the first half of the pages mapped already which + * can be write-protected for testing + */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, + nr_pages_per_cpu * page_size, true); + + /* + * Continue the 2nd half of the page copying, handling write + * protection faults if any + */ + for (page_nr = mid_nr; page_nr < end_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + return NULL; +} + +static int stress(struct uffd_stats *uffd_stats) +{ + unsigned long cpu; + pthread_t locking_threads[nr_cpus]; + pthread_t uffd_threads[nr_cpus]; + pthread_t background_threads[nr_cpus]; + + finished = 0; + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pthread_create(&locking_threads[cpu], &attr, + locking_thread, (void *)cpu)) + return 1; + if (bounces & BOUNCE_POLL) { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_poll_thread, + (void *)&uffd_stats[cpu])) + return 1; + } else { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_read_thread, + (void *)&uffd_stats[cpu])) + return 1; + pthread_mutex_lock(&uffd_read_mutex); + } + if (pthread_create(&background_threads[cpu], &attr, + background_thread, (void *)cpu)) + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(background_threads[cpu], NULL)) + return 1; + + /* + * Be strict and immediately zap area_src, the whole area has + * been transferred already by the background treads. The + * area_src could then be faulted in a racy way by still + * running uffdio_threads reading zeropages after we zapped + * area_src (but they're guaranteed to get -EEXIST from + * UFFDIO_COPY without writing zero pages into area_dst + * because the background threads already completed). 
+ */ + uffd_test_ops->release_pages(area_src); + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + char c; + if (bounces & BOUNCE_POLL) { + if (write(pipefd[cpu*2+1], &c, 1) != 1) + err("pipefd write error"); + if (pthread_join(uffd_threads[cpu], + (void *)&uffd_stats[cpu])) + return 1; + } else { + if (pthread_cancel(uffd_threads[cpu])) + return 1; + if (pthread_join(uffd_threads[cpu], NULL)) + return 1; + } + } + + return 0; +} + +sigjmp_buf jbuf, *sigbuf; + +static void sighndl(int sig, siginfo_t *siginfo, void *ptr) +{ + if (sig == SIGBUS) { + if (sigbuf) + siglongjmp(*sigbuf, 1); + abort(); + } +} + +/* + * For non-cooperative userfaultfd test we fork() a process that will + * generate pagefaults, will mremap the area monitored by the + * userfaultfd and at last this process will release the monitored + * area. + * For the anonymous and shared memory the area is divided into two + * parts, the first part is accessed before mremap, and the second + * part is accessed after mremap. Since hugetlbfs does not support + * mremap, the entire monitored area is accessed in a single pass for + * HUGETLB_TEST. + * The release of the pages currently generates event for shmem and + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked + * for hugetlb. + * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register + * monitored area, generate pagefaults and test that signal is delivered. + * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 + * test robustness use case - we release monitored area, fork a process + * that will generate pagefaults and verify signal is generated. + * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal + * feature. Using monitor thread, verify no userfault events are generated. + */ +static int faulting_process(int signal_test) +{ + unsigned long nr; + unsigned long long count; + unsigned long split_nr_pages; + unsigned long lastnr; + struct sigaction act; + volatile unsigned long signalled = 0; + + split_nr_pages = (nr_pages + 1) / 2; + + if (signal_test) { + sigbuf = &jbuf; + memset(&act, 0, sizeof(act)); + act.sa_sigaction = sighndl; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); + lastnr = (unsigned long)-1; + } + + for (nr = 0; nr < split_nr_pages; nr++) { + volatile int steps = 1; + unsigned long offset = nr * page_size; + + if (signal_test) { + if (sigsetjmp(*sigbuf, 1) != 0) { + if (steps == 1 && nr == lastnr) + err("Signal repeated"); + + lastnr = nr; + if (signal_test == 1) { + if (steps == 1) { + /* This is a MISSING request */ + steps++; + if (copy_page(uffd, offset)) + signalled++; + } else { + /* This is a WP request */ + assert(steps == 2); + wp_range(uffd, + (__u64)area_dst + + offset, + page_size, false); + } + } else { + signalled++; + continue; + } + } + } + + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + /* + * Trigger write protection if there is by writing + * the same value back. 
+ */ + *area_count(area_dst, nr) = count; + } + + if (signal_test) + return signalled != split_nr_pages; + + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, area_src); + if (area_dst == MAP_FAILED) + err("mremap"); + /* Reset area_src since we just clobbered it */ + area_src = NULL; + + for (; nr < nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + } + /* + * Trigger write protection if there is by writing + * the same value back. + */ + *area_count(area_dst, nr) = count; + } + + uffd_test_ops->release_pages(area_dst); + + for (nr = 0; nr < nr_pages; nr++) + if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) + err("nr %lu is not zero", nr); + + return 0; +} + +static void retry_uffdio_zeropage(int ufd, + struct uffdio_zeropage *uffdio_zeropage, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, + uffdio_zeropage->range.len, + offset); + if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { + if (uffdio_zeropage->zeropage != -EEXIST) + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } else { + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } +} + +static int __uffdio_zeropage(int ufd, unsigned long offset) +{ + struct uffdio_zeropage uffdio_zeropage; + int ret; + bool has_zeropage = !(test_type == TEST_HUGETLB); + __s64 res; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu", offset); + uffdio_zeropage.range.start = (unsigned long) area_dst + offset; + uffdio_zeropage.range.len = page_size; + uffdio_zeropage.mode = 0; + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + res = uffdio_zeropage.zeropage; + if (ret) { + /* real retval in ufdio_zeropage.zeropage */ + if (has_zeropage) + err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); + else if (res != -EINVAL) + err("UFFDIO_ZEROPAGE not -EINVAL"); + } else if (has_zeropage) { + if (res != page_size) { + err("UFFDIO_ZEROPAGE unexpected size"); + } else { + retry_uffdio_zeropage(ufd, &uffdio_zeropage, + offset); + return 1; + } + } else + err("UFFDIO_ZEROPAGE succeeded"); + + return 0; +} + +static int uffdio_zeropage(int ufd, unsigned long offset) +{ + return __uffdio_zeropage(ufd, offset); +} + +/* exercise UFFDIO_ZEROPAGE */ +static int userfaultfd_zeropage_test(void) +{ + struct uffdio_register uffdio_register; + + printf("testing UFFDIO_ZEROPAGE: "); + fflush(stdout); + + uffd_test_ctx_init(0); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (area_dst_alias) { + /* Needed this to test zeropage-retry on shared memory */ + uffdio_register.range.start = (unsigned long) area_dst_alias; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + } + + if (uffdio_zeropage(uffd, 0)) + if (my_bcmp(area_dst, zeropage, page_size)) + err("zeropage is not zero"); + + printf("done.\n"); + return 0; +} + +static int userfaultfd_events_test(void) +{ + struct uffdio_register uffdio_register; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + struct 
uffd_stats stats = { 0 }; + + printf("testing events (fork, remap, remove): "); + fflush(stdout); + + features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | + UFFD_FEATURE_EVENT_REMOVE; + uffd_test_ctx_init(features); + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(0)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + return 1; + + uffd_stats_report(&stats, 1); + + return stats.missing_faults != nr_pages; +} + +static int userfaultfd_sig_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long userfaults; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + struct uffd_stats stats = { 0 }; + + printf("testing signal delivery: "); + fflush(stdout); + + features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; + uffd_test_ctx_init(features); + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (faulting_process(1)) + err("faulting process failed"); + + uffd_test_ops->release_pages(area_dst); + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(2)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, (void **)&userfaults)) + return 1; + + printf("done.\n"); + if (userfaults) + err("Signal test failed, userfaults: %ld", userfaults); + + return userfaults != 0; +} + +void check_memory_contents(char *p) +{ + unsigned long i; + uint8_t expected_byte; + void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + +static int userfaultfd_minor_test(void) +{ + unsigned long p; + struct uffdio_register uffdio_register; + pthread_t uffd_mon; + char c; + struct uffd_stats stats = { 0 }; + + if (!test_uffdio_minor) + return 0; + + printf("testing minor faults: "); + fflush(stdout); + + uffd_test_ctx_init(uffd_minor_feature()); + + uffdio_register.range.start = (unsigned long)area_dst_alias; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = 
UFFDIO_REGISTER_MODE_MINOR; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) { + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + } + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + /* + * Read each of the pages back using the UFFD-registered mapping. We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + check_memory_contents(area_dst_alias); + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + return 1; + + uffd_stats_report(&stats, 1); + + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; +} + +static int pagemap_open(void) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + err("open pagemap"); + + return fd; +} + +/* This macro let __LINE__ works in err() */ +#define pagemap_check_wp(value, wp) do { \ + if (!!(value & PM_UFFD_WP) != wp) \ + err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ + } while (0) + +static int pagemap_test_fork(bool present) +{ + pid_t child = fork(); + uint64_t value; + int fd, result; + + if (!child) { + /* Open the pagemap fd of the child itself */ + fd = pagemap_open(); + value = pagemap_get_entry(fd, area_dst); + /* + * After fork() uffd-wp bit should be gone as long as we're + * without UFFD_FEATURE_EVENT_FORK + */ + pagemap_check_wp(value, false); + /* Succeed */ + exit(0); + } + waitpid(child, &result, 0); + return result; +} + +static void userfaultfd_wp_unpopulated_test(int pagemap_fd) +{ + uint64_t value; + + /* Test applying pte marker to anon unpopulated */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + + /* Test unprotect on anon pte marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Test zap on anon marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Test fault in after marker removed */ + *area_dst = 1; + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + /* Drop it to make pte none again */ + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + + /* Test read-zero-page upon pte marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + 
*(volatile char *)area_dst; + /* Drop it to make pte none again */ + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); +} + +static void userfaultfd_pagemap_test(unsigned int test_pgsize) +{ + struct uffdio_register uffdio_register; + int pagemap_fd; + uint64_t value; + + /* Pagemap tests uffd-wp only */ + if (!test_uffdio_wp) + return; + + /* Not enough memory to test this page size */ + if (test_pgsize > nr_pages * page_size) + return; + + printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); + /* Flush so it doesn't flush twice in parent/child later */ + fflush(stdout); + + uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); + + if (test_pgsize > page_size) { + /* This is a thp test */ + if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) + err("madvise(MADV_HUGEPAGE) failed"); + } else if (test_pgsize == page_size) { + /* This is normal page test; force no thp */ + if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) + err("madvise(MADV_NOHUGEPAGE) failed"); + } + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Smoke test WP_UNPOPULATED first when it's still empty */ + if (test_pgsize == page_size) + userfaultfd_wp_unpopulated_test(pagemap_fd); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(true)) + err("Detected stall uffd-wp bit in child"); + + /* Exclusive required or PAGEOUT won't work */ + if (!(value & PM_MMAP_EXCLUSIVE)) + err("multiple mapping detected: 0x%"PRIx64, value); + + if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) + err("madvise(MADV_PAGEOUT) failed"); + + /* Uffd-wp should persist even swapped out */ + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(false)) + err("Detected stall uffd-wp bit in child"); + + /* Unprotect; this tests swap pte modifications */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Fault in the page from disk */ + *area_dst = 2; + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + close(pagemap_fd); + printf("done\n"); +} + +static int userfaultfd_stress(void) +{ + void *area; + unsigned long nr; + struct uffdio_register uffdio_register; + struct uffd_stats uffd_stats[nr_cpus]; + + uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); + + if (posix_memalign(&area, page_size, page_size)) + err("out of memory"); + zeropage = area; + bzero(zeropage, page_size); + + pthread_mutex_lock(&uffd_read_mutex); + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 16*1024*1024); + + while (bounces--) { + printf("bounces: %d, mode:", bounces); + if (bounces & BOUNCE_RANDOM) + printf(" rnd"); + if (bounces & BOUNCE_RACINGFAULTS) + printf(" racing"); + if (bounces & BOUNCE_VERIFY) + printf(" ver"); + if (bounces & BOUNCE_POLL) + printf(" poll"); + else + printf(" read"); + printf(", "); + fflush(stdout); + + if (bounces & BOUNCE_POLL) + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + else + fcntl(uffd, F_SETFL, 
uffd_flags & ~O_NONBLOCK); + + /* register */ + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (area_dst_alias) { + uffdio_register.range.start = (unsigned long) + area_dst_alias; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure alias"); + } + + /* + * The madvise done previously isn't enough: some + * uffd_thread could have read userfaults (one of + * those already resolved by the background thread) + * and it may be in the process of calling + * UFFDIO_COPY. UFFDIO_COPY will read the zapped + * area_src and it would map a zero page in it (of + * course such a UFFDIO_COPY is perfectly safe as it'd + * return -EEXIST). The problem comes at the next + * bounce though: that racing UFFDIO_COPY would + * generate zeropages in the area_src, so invalidating + * the previous MADV_DONTNEED. Without this additional + * MADV_DONTNEED those zeropages leftovers in the + * area_src would lead to -EEXIST failure during the + * next bounce, effectively leaving a zeropage in the + * area_dst. + * + * Try to comment this out madvise to see the memory + * corruption being caught pretty quick. + * + * khugepaged is also inhibited to collapse THP after + * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's + * required to MADV_DONTNEED here. + */ + uffd_test_ops->release_pages(area_dst); + + uffd_stats_reset(uffd_stats, nr_cpus); + + /* bounce pass */ + if (stress(uffd_stats)) + return 1; + + /* Clear all the write protections if there is any */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst, + nr_pages * page_size, false); + + /* unregister */ + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) + err("unregister failure"); + if (area_dst_alias) { + uffdio_register.range.start = (unsigned long) area_dst; + if (ioctl(uffd, UFFDIO_UNREGISTER, + &uffdio_register.range)) + err("unregister failure alias"); + } + + /* verification */ + if (bounces & BOUNCE_VERIFY) + for (nr = 0; nr < nr_pages; nr++) + if (*area_count(area_dst, nr) != count_verify[nr]) + err("error area_count %llu %llu %lu\n", + *area_count(area_src, nr), + count_verify[nr], nr); + + /* prepare next bounce */ + swap(area_src, area_dst); + + swap(area_src_alias, area_dst_alias); + + uffd_stats_report(uffd_stats, nr_cpus); + } + + if (test_type == TEST_ANON) { + /* + * shmem/hugetlb won't be able to run since they have different + * behavior on fork() (file-backed memory normally drops ptes + * directly when fork), meanwhile the pagemap test will verify + * pgtable entry of fork()ed child. 
+ */ + userfaultfd_pagemap_test(page_size); + /* + * Hard-code for x86_64 for now for 2M THP, as x86_64 is + * currently the only one that supports uffd-wp + */ + userfaultfd_pagemap_test(page_size * 512); + } + + return userfaultfd_zeropage_test() || userfaultfd_sig_test() + || userfaultfd_events_test() || userfaultfd_minor_test(); +} + +static void set_test_type(const char *type) +{ + if (!strcmp(type, "anon")) { + test_type = TEST_ANON; + uffd_test_ops = &anon_uffd_test_ops; + } else if (!strcmp(type, "hugetlb")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + } else if (!strcmp(type, "hugetlb_shared")) { + map_shared = true; + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + /* Minor faults require shared hugetlb; only enable here. */ + test_uffdio_minor = true; + } else if (!strcmp(type, "shmem")) { + map_shared = true; + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + test_uffdio_minor = true; + } +} + +static void parse_test_type_arg(const char *raw_type) +{ + char *buf = strdup(raw_type); + uint64_t features = UFFD_API_FEATURES; + + while (buf) { + const char *token = strsep(&buf, ":"); + + if (!test_type) + set_test_type(token); + else if (!strcmp(token, "dev")) + test_dev_userfaultfd = true; + else if (!strcmp(token, "syscall")) + test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; + else + err("unrecognized test mod '%s'", token); + } + + if (!test_type) + err("failed to parse test type argument: '%s'", raw_type); + + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); + + if (test_type == TEST_HUGETLB) + page_size = hpage_size; + else + page_size = sysconf(_SC_PAGE_SIZE); + + if (!page_size) + err("Unable to determine page size"); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 + > page_size) + err("Impossible to run this test"); + + /* + * Whether we can test certain features depends not just on test type, + * but also on whether or not this particular kernel supports the + * feature. + */ + + userfaultfd_open(&features); + + test_uffdio_wp = test_uffdio_wp && + (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); + test_uffdio_minor = test_uffdio_minor && + (features & uffd_minor_feature()); + + close(uffd); + uffd = -1; +} + +static void sigalrm(int sig) +{ + if (sig != SIGALRM) + abort(); + test_uffdio_copy_eexist = true; + alarm(ALARM_INTERVAL_SECS); +} + +int main(int argc, char **argv) +{ + size_t bytes; + + if (argc < 4) + usage(); + + if (signal(SIGALRM, sigalrm) == SIG_ERR) + err("failed to arm SIGALRM"); + alarm(ALARM_INTERVAL_SECS); + + hpage_size = default_huge_page_size(); + parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be multiple of %lu if :collapse mod set", + hpage_size >> 20); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * paze_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (2 << (H-P)) + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to chose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. Unfortunately, prime factorization of + * N and nr_cpus may be arbitrary, so have to search for it. 
+ * Instead, just use the highest power of 2 dividing both + * nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ? x : y; + } + nr_pages_per_cpu = bytes / page_size / nr_cpus; + if (!nr_pages_per_cpu) { + _err("invalid MiB"); + usage(); + } + + bounces = atoi(argv[3]); + if (bounces <= 0) { + _err("invalid bounces"); + usage(); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + + if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { + unsigned int memfd_flags = 0; + + if (test_type == TEST_HUGETLB) + memfd_flags = MFD_HUGETLB; + mem_fd = memfd_create(argv[0], memfd_flags); + if (mem_fd < 0) + err("memfd_create"); + if (ftruncate(mem_fd, nr_pages * page_size * 2)) + err("ftruncate"); + if (fallocate(mem_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + nr_pages * page_size * 2)) + err("fallocate"); + } + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c new file mode 100644 index 000000000000..6857388783be --- /dev/null +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Userfaultfd unit tests. + * + * Copyright (C) 2015-2023 Red Hat, Inc. + */ + +#include "uffd-common.h" + +#ifdef __NR_userfaultfd + +int main(int argc, char *argv[]) +{ + return KSFT_PASS; +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("Skipping %s (missing __NR_userfaultfd)\n", __FILE__); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c deleted file mode 100644 index c68a9aeefc41..000000000000 --- a/tools/testing/selftests/mm/userfaultfd.c +++ /dev/null @@ -1,1168 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Stress userfaultfd syscall. - * - * Copyright (C) 2015 Red Hat, Inc. - * - * This test allocates two virtual areas and bounces the physical - * memory across the two virtual areas (from area_src to area_dst) - * using userfaultfd. - * - * There are three threads running per CPU: - * - * 1) one per-CPU thread takes a per-page pthread_mutex in a random - * page of the area_dst (while the physical page may still be in - * area_src), and increments a per-page counter in the same page, - * and checks its value against a verification region. - * - * 2) another per-CPU thread handles the userfaults generated by - * thread 1 above. userfaultfd blocking reads or poll() modes are - * exercised interleaved. - * - * 3) one last per-CPU thread transfers the memory in the background - * at maximum bandwidth (if not already transferred by thread - * 2). Each cpu thread takes cares of transferring a portion of the - * area. - * - * When all threads of type 3 completed the transfer, one bounce is - * complete. area_src and area_dst are then swapped. All threads are - * respawned and so the bounce is immediately restarted in the - * opposite direction. - * - * per-CPU threads 1 by triggering userfaults inside - * pthread_mutex_lock will also verify the atomicity of the memory - * transfer (UFFDIO_COPY).
- */ - -#include "uffd-common.h" - -#ifdef __NR_userfaultfd - -#define BOUNCE_RANDOM (1<<0) -#define BOUNCE_RACINGFAULTS (1<<1) -#define BOUNCE_VERIFY (1<<2) -#define BOUNCE_POLL (1<<3) -static int bounces; - -/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ -#define ALARM_INTERVAL_SECS 10 -static char *zeropage; -pthread_attr_t attr; - -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - -#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) - -const char *examples = - "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" - "./userfaultfd anon 100 99999\n\n" - "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" - "./userfaultfd anon:dev 100 99999\n\n" - "# Run share memory test on 1GiB region with 99 bounces:\n" - "./userfaultfd shmem 1000 99\n\n" - "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" - "./userfaultfd hugetlb 256 50\n\n" - "# Run the same hugetlb test but using shared file:\n" - "./userfaultfd hugetlb_shared 256 50\n\n" - "# 10MiB-~6GiB 999 bounces anonymous test, " - "continue forever unless an error triggers\n" - "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; - -static void usage(void) -{ - fprintf(stderr, "\nUsage: ./userfaultfd " - "[hugetlbfs_file]\n\n"); - fprintf(stderr, "Supported : anon, hugetlb, " - "hugetlb_shared, shmem\n\n"); - fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " - "Supported mods:\n"); - fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); - fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); - fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" - "memory\n"); - fprintf(stderr, "\nExample test mod usage:\n"); - fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); - fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); - - fprintf(stderr, "Examples:\n\n"); - fprintf(stderr, "%s", examples); - exit(1); -} - -static void uffd_stats_reset(struct uffd_stats *uffd_stats, - unsigned long n_cpus) -{ - int i; - - for (i = 0; i < n_cpus; i++) { - uffd_stats[i].cpu = i; - uffd_stats[i].missing_faults = 0; - uffd_stats[i].wp_faults = 0; - uffd_stats[i].minor_faults = 0; - } -} - -static inline uint64_t uffd_minor_feature(void) -{ - if (test_type == TEST_HUGETLB && map_shared) - return UFFD_FEATURE_MINOR_HUGETLBFS; - else if (test_type == TEST_SHMEM) - return UFFD_FEATURE_MINOR_SHMEM; - else - return 0; -} - -static int my_bcmp(char *str1, char *str2, size_t n) -{ - unsigned long i; - for (i = 0; i < n; i++) - if (str1[i] != str2[i]) - return 1; - return 0; -} - -static void *locking_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr; - unsigned long long count; - - if (!(bounces & BOUNCE_RANDOM)) { - page_nr = -bounces; - if (!(bounces & BOUNCE_RACINGFAULTS)) - page_nr += cpu * nr_pages_per_cpu; - } - - while (!finished) { - if (bounces & BOUNCE_RANDOM) { - if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) - err("getrandom failed"); - } else - page_nr += 1; - page_nr %= nr_pages; - pthread_mutex_lock(area_mutex(area_dst, page_nr)); - count = *area_count(area_dst, page_nr); - if (count != count_verify[page_nr]) - err("page_nr %lu memory corruption %llu %llu", - page_nr, count, count_verify[page_nr]); - count++; - *area_count(area_dst, page_nr) = count_verify[page_nr] = count; - pthread_mutex_unlock(area_mutex(area_dst, page_nr)); - } - - return NULL; -} - -static int 
copy_page_retry(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, true); -} - -pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; - -static void *uffd_read_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - struct uffd_msg msg; - - pthread_mutex_unlock(&uffd_read_mutex); - /* from here cancellation is ok */ - - for (;;) { - if (uffd_read_msg(uffd, &msg)) - continue; - uffd_handle_page_fault(&msg, stats); - } - - return NULL; -} - -static void *background_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr, start_nr, mid_nr, end_nr; - - start_nr = cpu * nr_pages_per_cpu; - end_nr = (cpu+1) * nr_pages_per_cpu; - mid_nr = (start_nr + end_nr) / 2; - - /* Copy the first half of the pages */ - for (page_nr = start_nr; page_nr < mid_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - /* - * If we need to test uffd-wp, set it up now. Then we'll have - * at least the first half of the pages mapped already which - * can be write-protected for testing - */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, - nr_pages_per_cpu * page_size, true); - - /* - * Continue the 2nd half of the page copying, handling write - * protection faults if any - */ - for (page_nr = mid_nr; page_nr < end_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - return NULL; -} - -static int stress(struct uffd_stats *uffd_stats) -{ - unsigned long cpu; - pthread_t locking_threads[nr_cpus]; - pthread_t uffd_threads[nr_cpus]; - pthread_t background_threads[nr_cpus]; - - finished = 0; - for (cpu = 0; cpu < nr_cpus; cpu++) { - if (pthread_create(&locking_threads[cpu], &attr, - locking_thread, (void *)cpu)) - return 1; - if (bounces & BOUNCE_POLL) { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_poll_thread, - (void *)&uffd_stats[cpu])) - return 1; - } else { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_read_thread, - (void *)&uffd_stats[cpu])) - return 1; - pthread_mutex_lock(&uffd_read_mutex); - } - if (pthread_create(&background_threads[cpu], &attr, - background_thread, (void *)cpu)) - return 1; - } - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(background_threads[cpu], NULL)) - return 1; - - /* - * Be strict and immediately zap area_src, the whole area has - * been transferred already by the background treads. The - * area_src could then be faulted in a racy way by still - * running uffdio_threads reading zeropages after we zapped - * area_src (but they're guaranteed to get -EEXIST from - * UFFDIO_COPY without writing zero pages into area_dst - * because the background threads already completed). 
- */ - uffd_test_ops->release_pages(area_src); - - finished = 1; - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(locking_threads[cpu], NULL)) - return 1; - - for (cpu = 0; cpu < nr_cpus; cpu++) { - char c; - if (bounces & BOUNCE_POLL) { - if (write(pipefd[cpu*2+1], &c, 1) != 1) - err("pipefd write error"); - if (pthread_join(uffd_threads[cpu], - (void *)&uffd_stats[cpu])) - return 1; - } else { - if (pthread_cancel(uffd_threads[cpu])) - return 1; - if (pthread_join(uffd_threads[cpu], NULL)) - return 1; - } - } - - return 0; -} - -sigjmp_buf jbuf, *sigbuf; - -static void sighndl(int sig, siginfo_t *siginfo, void *ptr) -{ - if (sig == SIGBUS) { - if (sigbuf) - siglongjmp(*sigbuf, 1); - abort(); - } -} - -/* - * For non-cooperative userfaultfd test we fork() a process that will - * generate pagefaults, will mremap the area monitored by the - * userfaultfd and at last this process will release the monitored - * area. - * For the anonymous and shared memory the area is divided into two - * parts, the first part is accessed before mremap, and the second - * part is accessed after mremap. Since hugetlbfs does not support - * mremap, the entire monitored area is accessed in a single pass for - * HUGETLB_TEST. - * The release of the pages currently generates event for shmem and - * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked - * for hugetlb. - * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register - * monitored area, generate pagefaults and test that signal is delivered. - * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 - * test robustness use case - we release monitored area, fork a process - * that will generate pagefaults and verify signal is generated. - * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal - * feature. Using monitor thread, verify no userfault events are generated. - */ -static int faulting_process(int signal_test) -{ - unsigned long nr; - unsigned long long count; - unsigned long split_nr_pages; - unsigned long lastnr; - struct sigaction act; - volatile unsigned long signalled = 0; - - split_nr_pages = (nr_pages + 1) / 2; - - if (signal_test) { - sigbuf = &jbuf; - memset(&act, 0, sizeof(act)); - act.sa_sigaction = sighndl; - act.sa_flags = SA_SIGINFO; - if (sigaction(SIGBUS, &act, 0)) - err("sigaction"); - lastnr = (unsigned long)-1; - } - - for (nr = 0; nr < split_nr_pages; nr++) { - volatile int steps = 1; - unsigned long offset = nr * page_size; - - if (signal_test) { - if (sigsetjmp(*sigbuf, 1) != 0) { - if (steps == 1 && nr == lastnr) - err("Signal repeated"); - - lastnr = nr; - if (signal_test == 1) { - if (steps == 1) { - /* This is a MISSING request */ - steps++; - if (copy_page(uffd, offset)) - signalled++; - } else { - /* This is a WP request */ - assert(steps == 2); - wp_range(uffd, - (__u64)area_dst + - offset, - page_size, false); - } - } else { - signalled++; - continue; - } - } - } - - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - /* - * Trigger write protection if there is by writing - * the same value back. 
- */ - *area_count(area_dst, nr) = count; - } - - if (signal_test) - return signalled != split_nr_pages; - - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) - err("mremap"); - /* Reset area_src since we just clobbered it */ - area_src = NULL; - - for (; nr < nr_pages; nr++) { - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) { - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - } - /* - * Trigger write protection if there is by writing - * the same value back. - */ - *area_count(area_dst, nr) = count; - } - - uffd_test_ops->release_pages(area_dst); - - for (nr = 0; nr < nr_pages; nr++) - if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) - err("nr %lu is not zero", nr); - - return 0; -} - -static void retry_uffdio_zeropage(int ufd, - struct uffdio_zeropage *uffdio_zeropage, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, - uffdio_zeropage->range.len, - offset); - if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { - if (uffdio_zeropage->zeropage != -EEXIST) - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } else { - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } -} - -static int __uffdio_zeropage(int ufd, unsigned long offset) -{ - struct uffdio_zeropage uffdio_zeropage; - int ret; - bool has_zeropage = !(test_type == TEST_HUGETLB); - __s64 res; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu", offset); - uffdio_zeropage.range.start = (unsigned long) area_dst + offset; - uffdio_zeropage.range.len = page_size; - uffdio_zeropage.mode = 0; - ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); - res = uffdio_zeropage.zeropage; - if (ret) { - /* real retval in ufdio_zeropage.zeropage */ - if (has_zeropage) - err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); - else if (res != -EINVAL) - err("UFFDIO_ZEROPAGE not -EINVAL"); - } else if (has_zeropage) { - if (res != page_size) { - err("UFFDIO_ZEROPAGE unexpected size"); - } else { - retry_uffdio_zeropage(ufd, &uffdio_zeropage, - offset); - return 1; - } - } else - err("UFFDIO_ZEROPAGE succeeded"); - - return 0; -} - -static int uffdio_zeropage(int ufd, unsigned long offset) -{ - return __uffdio_zeropage(ufd, offset); -} - -/* exercise UFFDIO_ZEROPAGE */ -static int userfaultfd_zeropage_test(void) -{ - struct uffdio_register uffdio_register; - - printf("testing UFFDIO_ZEROPAGE: "); - fflush(stdout); - - uffd_test_ctx_init(0); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (area_dst_alias) { - /* Needed this to test zeropage-retry on shared memory */ - uffdio_register.range.start = (unsigned long) area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - } - - if (uffdio_zeropage(uffd, 0)) - if (my_bcmp(area_dst, zeropage, page_size)) - err("zeropage is not zero"); - - printf("done.\n"); - return 0; -} - -static int userfaultfd_events_test(void) -{ - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct 
uffd_stats stats = { 0 }; - - printf("testing events (fork, remap, remove): "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_REMOVE; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(0)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - return stats.missing_faults != nr_pages; -} - -static int userfaultfd_sig_test(void) -{ - struct uffdio_register uffdio_register; - unsigned long userfaults; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing signal delivery: "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (faulting_process(1)) - err("faulting process failed"); - - uffd_test_ops->release_pages(area_dst); - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(2)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, (void **)&userfaults)) - return 1; - - printf("done.\n"); - if (userfaults) - err("Signal test failed, userfaults: %ld", userfaults); - - return userfaults != 0; -} - -void check_memory_contents(char *p) -{ - unsigned long i; - uint8_t expected_byte; - void *expected_page; - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (i = 0; i < nr_pages; ++i) { - expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, p + (i * page_size), page_size)) - err("unexpected page contents after minor fault"); - } - - free(expected_page); -} - -static int userfaultfd_minor_test(void) -{ - unsigned long p; - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - char c; - struct uffd_stats stats = { 0 }; - - if (!test_uffdio_minor) - return 0; - - printf("testing minor faults: "); - fflush(stdout); - - uffd_test_ctx_init(uffd_minor_feature()); - - uffdio_register.range.start = (unsigned long)area_dst_alias; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = 
UFFDIO_REGISTER_MODE_MINOR; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - /* - * After registering with UFFD, populate the non-UFFD-registered side of - * the shared mapping. This should *not* trigger any UFFD minor faults. - */ - for (p = 0; p < nr_pages; ++p) { - memset(area_dst + (p * page_size), p % ((uint8_t)-1), - page_size); - } - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - /* - * Read each of the pages back using the UFFD-registered mapping. We - * expect that the first time we touch a page, it will result in a minor - * fault. uffd_poll_thread will resolve the fault by bit-flipping the - * page's contents, and then issuing a CONTINUE ioctl. - */ - check_memory_contents(area_dst_alias); - - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - if (test_collapse) { - printf("testing collapse of uffd memory into PMD-mapped THPs:"); - if (madvise(area_dst_alias, nr_pages * page_size, - MADV_COLLAPSE)) - err("madvise(MADV_COLLAPSE)"); - - uffd_test_ops->check_pmd_mapping(area_dst, - nr_pages * page_size / - hpage_size); - /* - * This won't cause uffd-fault - it purely just makes sure there - * was no corruption. - */ - check_memory_contents(area_dst_alias); - printf(" done.\n"); - } - - return stats.missing_faults != 0 || stats.minor_faults != nr_pages; -} - -static int pagemap_open(void) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - - if (fd < 0) - err("open pagemap"); - - return fd; -} - -/* This macro let __LINE__ works in err() */ -#define pagemap_check_wp(value, wp) do { \ - if (!!(value & PM_UFFD_WP) != wp) \ - err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ - } while (0) - -static int pagemap_test_fork(bool present) -{ - pid_t child = fork(); - uint64_t value; - int fd, result; - - if (!child) { - /* Open the pagemap fd of the child itself */ - fd = pagemap_open(); - value = pagemap_get_entry(fd, area_dst); - /* - * After fork() uffd-wp bit should be gone as long as we're - * without UFFD_FEATURE_EVENT_FORK - */ - pagemap_check_wp(value, false); - /* Succeed */ - exit(0); - } - waitpid(child, &result, 0); - return result; -} - -static void userfaultfd_wp_unpopulated_test(int pagemap_fd) -{ - uint64_t value; - - /* Test applying pte marker to anon unpopulated */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - - /* Test unprotect on anon pte marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Test zap on anon marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - if (madvise(area_dst, page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Test fault in after marker removed */ - *area_dst = 1; - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - /* Drop it to make pte none again */ - if (madvise(area_dst, page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - - /* Test read-zero-page upon pte marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - 
*(volatile char *)area_dst; - /* Drop it to make pte none again */ - if (madvise(area_dst, page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); -} - -static void userfaultfd_pagemap_test(unsigned int test_pgsize) -{ - struct uffdio_register uffdio_register; - int pagemap_fd; - uint64_t value; - - /* Pagemap tests uffd-wp only */ - if (!test_uffdio_wp) - return; - - /* Not enough memory to test this page size */ - if (test_pgsize > nr_pages * page_size) - return; - - printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); - /* Flush so it doesn't flush twice in parent/child later */ - fflush(stdout); - - uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); - - if (test_pgsize > page_size) { - /* This is a thp test */ - if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) - err("madvise(MADV_HUGEPAGE) failed"); - } else if (test_pgsize == page_size) { - /* This is normal page test; force no thp */ - if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) - err("madvise(MADV_NOHUGEPAGE) failed"); - } - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failed"); - - pagemap_fd = pagemap_open(); - - /* Smoke test WP_UNPOPULATED first when it's still empty */ - if (test_pgsize == page_size) - userfaultfd_wp_unpopulated_test(pagemap_fd); - - /* Touch the page */ - *area_dst = 1; - wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(true)) - err("Detected stall uffd-wp bit in child"); - - /* Exclusive required or PAGEOUT won't work */ - if (!(value & PM_MMAP_EXCLUSIVE)) - err("multiple mapping detected: 0x%"PRIx64, value); - - if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) - err("madvise(MADV_PAGEOUT) failed"); - - /* Uffd-wp should persist even swapped out */ - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(false)) - err("Detected stall uffd-wp bit in child"); - - /* Unprotect; this tests swap pte modifications */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Fault in the page from disk */ - *area_dst = 2; - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - close(pagemap_fd); - printf("done\n"); -} - -static int userfaultfd_stress(void) -{ - void *area; - unsigned long nr; - struct uffdio_register uffdio_register; - struct uffd_stats uffd_stats[nr_cpus]; - - uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); - - if (posix_memalign(&area, page_size, page_size)) - err("out of memory"); - zeropage = area; - bzero(zeropage, page_size); - - pthread_mutex_lock(&uffd_read_mutex); - - pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, 16*1024*1024); - - while (bounces--) { - printf("bounces: %d, mode:", bounces); - if (bounces & BOUNCE_RANDOM) - printf(" rnd"); - if (bounces & BOUNCE_RACINGFAULTS) - printf(" racing"); - if (bounces & BOUNCE_VERIFY) - printf(" ver"); - if (bounces & BOUNCE_POLL) - printf(" poll"); - else - printf(" read"); - printf(", "); - fflush(stdout); - - if (bounces & BOUNCE_POLL) - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - else - fcntl(uffd, F_SETFL, 
uffd_flags & ~O_NONBLOCK); - - /* register */ - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) - area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure alias"); - } - - /* - * The madvise done previously isn't enough: some - * uffd_thread could have read userfaults (one of - * those already resolved by the background thread) - * and it may be in the process of calling - * UFFDIO_COPY. UFFDIO_COPY will read the zapped - * area_src and it would map a zero page in it (of - * course such a UFFDIO_COPY is perfectly safe as it'd - * return -EEXIST). The problem comes at the next - * bounce though: that racing UFFDIO_COPY would - * generate zeropages in the area_src, so invalidating - * the previous MADV_DONTNEED. Without this additional - * MADV_DONTNEED those zeropages leftovers in the - * area_src would lead to -EEXIST failure during the - * next bounce, effectively leaving a zeropage in the - * area_dst. - * - * Try to comment this out madvise to see the memory - * corruption being caught pretty quick. - * - * khugepaged is also inhibited to collapse THP after - * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's - * required to MADV_DONTNEED here. - */ - uffd_test_ops->release_pages(area_dst); - - uffd_stats_reset(uffd_stats, nr_cpus); - - /* bounce pass */ - if (stress(uffd_stats)) - return 1; - - /* Clear all the write protections if there is any */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst, - nr_pages * page_size, false); - - /* unregister */ - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) - err("unregister failure"); - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) area_dst; - if (ioctl(uffd, UFFDIO_UNREGISTER, - &uffdio_register.range)) - err("unregister failure alias"); - } - - /* verification */ - if (bounces & BOUNCE_VERIFY) - for (nr = 0; nr < nr_pages; nr++) - if (*area_count(area_dst, nr) != count_verify[nr]) - err("error area_count %llu %llu %lu\n", - *area_count(area_src, nr), - count_verify[nr], nr); - - /* prepare next bounce */ - swap(area_src, area_dst); - - swap(area_src_alias, area_dst_alias); - - uffd_stats_report(uffd_stats, nr_cpus); - } - - if (test_type == TEST_ANON) { - /* - * shmem/hugetlb won't be able to run since they have different - * behavior on fork() (file-backed memory normally drops ptes - * directly when fork), meanwhile the pagemap test will verify - * pgtable entry of fork()ed child. 
- */ - userfaultfd_pagemap_test(page_size); - /* - * Hard-code for x86_64 for now for 2M THP, as x86_64 is - * currently the only one that supports uffd-wp - */ - userfaultfd_pagemap_test(page_size * 512); - } - - return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test() || userfaultfd_minor_test(); -} - -static void set_test_type(const char *type) -{ - if (!strcmp(type, "anon")) { - test_type = TEST_ANON; - uffd_test_ops = &anon_uffd_test_ops; - } else if (!strcmp(type, "hugetlb")) { - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - } else if (!strcmp(type, "hugetlb_shared")) { - map_shared = true; - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - /* Minor faults require shared hugetlb; only enable here. */ - test_uffdio_minor = true; - } else if (!strcmp(type, "shmem")) { - map_shared = true; - test_type = TEST_SHMEM; - uffd_test_ops = &shmem_uffd_test_ops; - test_uffdio_minor = true; - } -} - -static void parse_test_type_arg(const char *raw_type) -{ - char *buf = strdup(raw_type); - uint64_t features = UFFD_API_FEATURES; - - while (buf) { - const char *token = strsep(&buf, ":"); - - if (!test_type) - set_test_type(token); - else if (!strcmp(token, "dev")) - test_dev_userfaultfd = true; - else if (!strcmp(token, "syscall")) - test_dev_userfaultfd = false; - else if (!strcmp(token, "collapse")) - test_collapse = true; - else - err("unrecognized test mod '%s'", token); - } - - if (!test_type) - err("failed to parse test type argument: '%s'", raw_type); - - if (test_collapse && test_type != TEST_SHMEM) - err("Unsupported test: %s", raw_type); - - if (test_type == TEST_HUGETLB) - page_size = hpage_size; - else - page_size = sysconf(_SC_PAGE_SIZE); - - if (!page_size) - err("Unable to determine page size"); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - err("Impossible to run this test"); - - /* - * Whether we can test certain features depends not just on test type, - * but also on whether or not this particular kernel supports the - * feature. - */ - - userfaultfd_open(&features); - - test_uffdio_wp = test_uffdio_wp && - (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); - test_uffdio_minor = test_uffdio_minor && - (features & uffd_minor_feature()); - - close(uffd); - uffd = -1; -} - -static void sigalrm(int sig) -{ - if (sig != SIGALRM) - abort(); - test_uffdio_copy_eexist = true; - alarm(ALARM_INTERVAL_SECS); -} - -int main(int argc, char **argv) -{ - size_t bytes; - - if (argc < 4) - usage(); - - if (signal(SIGALRM, sigalrm) == SIG_ERR) - err("failed to arm SIGALRM"); - alarm(ALARM_INTERVAL_SECS); - - hpage_size = default_huge_page_size(); - parse_test_type_arg(argv[1]); - bytes = atol(argv[2]) * 1024 * 1024; - - if (test_collapse && bytes & (hpage_size - 1)) - err("MiB must be multiple of %lu if :collapse mod set", - hpage_size >> 20); - - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - - if (test_collapse) { - /* nr_cpus must divide (bytes / page_size), otherwise, - * area allocations of (nr_pages * paze_size) won't be a - * multiple of hpage_size, even if bytes is a multiple of - * hpage_size. - * - * This means that nr_cpus must divide (N * (2 << (H-P)) - * where: - * bytes = hpage_size * N - * hpage_size = 2 << H - * page_size = 2 << P - * - * And we want to chose nr_cpus to be the largest value - * satisfying this constraint, not larger than the number - * of online CPUs. Unfortunately, prime factorization of - * N and nr_cpus may be arbitrary, so have to search for it. 
- * Instead, just use the highest power of 2 dividing both - * nr_cpus and (bytes / page_size). - */ - int x = factor_of_2(nr_cpus); - int y = factor_of_2(bytes / page_size); - - nr_cpus = x < y ? x : y; - } - nr_pages_per_cpu = bytes / page_size / nr_cpus; - if (!nr_pages_per_cpu) { - _err("invalid MiB"); - usage(); - } - - bounces = atoi(argv[3]); - if (bounces <= 0) { - _err("invalid bounces"); - usage(); - } - nr_pages = nr_pages_per_cpu * nr_cpus; - - if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { - unsigned int memfd_flags = 0; - - if (test_type == TEST_HUGETLB) - memfd_flags = MFD_HUGETLB; - mem_fd = memfd_create(argv[0], memfd_flags); - if (mem_fd < 0) - err("memfd_create"); - if (ftruncate(mem_fd, nr_pages * page_size * 2)) - err("ftruncate"); - if (fallocate(mem_fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, - nr_pages * page_size * 2)) - err("fallocate"); - } - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); -} - -#else /* __NR_userfaultfd */ - -#warning "missing __NR_userfaultfd definition" - -int main(void) -{ - printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_userfaultfd */ -- cgit v1.2.3 From c4277cb6c8e5dc60d425f3a148b3e2bf40d8d778 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:47 -0400 Subject: selftests/mm: uffd_[un]register() Add two helpers to register and unregister ranges with a uffd, and use them to drop the duplicated code. This patch also drops assert_expected_ioctls_present() and get_expected_ioctls(). Reasons: - It would take a lot of effort to pass test_type==HUGETLB into it from the callers, and dropping it is the simplest way to get rid of another global variable. - The ioctls bitmask returned by UFFDIO_REGISTER is hardly useful at all, because any app can already detect kernel support for any ioctl via its corresponding UFFD_FEATURE_*. The check here is mostly a sanity check, and it is unlikely any user application will ever rely on it. - It works against the future goal of running the uffd selftests on old kernels: get_expected_ioctls() is compiled against UFFD_API_RANGE_IOCTLS, a value that depends on where the test was built rather than on what the running kernel actually supports, so it would report false negatives on old kernels. So let's make our lives easier.
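For reference, a minimal sketch of the call pattern the new helpers enable (prototypes as added to vm_util.h below; uffd, area_dst, nr_pages, page_size, test_uffdio_wp and err() are the existing selftest globals and helpers):

	/* Register the range for MISSING faults, optionally tracking uffd-wp too. */
	if (uffd_register(uffd, area_dst, nr_pages * page_size,
			  true /* miss */, test_uffdio_wp /* wp */, false /* minor */))
		err("register failure");

	/* ... exercise the range ... */

	/* Both helpers return 0 on success and -errno on failure. */
	if (uffd_unregister(uffd, area_dst, nr_pages * page_size))
		err("unregister failure");

The three booleans map directly onto UFFDIO_REGISTER_MODE_MISSING, UFFDIO_REGISTER_MODE_WP and UFFDIO_REGISTER_MODE_MINOR, so call sites no longer need to build a struct uffdio_register by hand.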
[peterx@redhat.com; tools/testing/selftests/mm/hugepage-mremap.c: add headers] Link: https://lkml.kernel.org/r/ZDxrvZh/cw357D8P@x1n Link: https://lkml.kernel.org/r/20230412164247.328293-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugepage-mremap.c | 9 +-- tools/testing/selftests/mm/ksm_functional_tests.c | 6 +- tools/testing/selftests/mm/uffd-common.c | 27 -------- tools/testing/selftests/mm/uffd-common.h | 2 - tools/testing/selftests/mm/uffd-stress.c | 83 ++++++----------------- tools/testing/selftests/mm/vm_util.c | 37 ++++++++++ tools/testing/selftests/mm/vm_util.h | 4 ++ 7 files changed, 64 insertions(+), 104 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c index e53b5eaa8fce..cabd0084f57b 100644 --- a/tools/testing/selftests/mm/hugepage-mremap.c +++ b/tools/testing/selftests/mm/hugepage-mremap.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include "vm_util.h" #define DEFAULT_LENGTH_MB 10UL #define MB_TO_BYTES(x) (x * 1024 * 1024) @@ -60,7 +62,6 @@ static void register_region_with_uffd(char *addr, size_t len) { long uffd; /* userfaultfd file descriptor */ struct uffdio_api uffdio_api; - struct uffdio_register uffdio_register; /* Create and enable userfaultfd object. */ @@ -96,11 +97,7 @@ static void register_region_with_uffd(char *addr, size_t len) * handling by the userfaultfd object. In mode, we request to track * missing pages (i.e., pages that have not yet been faulted in). */ - - uffdio_register.range.start = (unsigned long)addr; - uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + if (uffd_register(uffd, addr, len, true, false, false)) { perror("ioctl-UFFDIO_REGISTER"); exit(1); } diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index d8b5b4930412..d3f26050dfd7 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -178,7 +178,6 @@ unmap: static void test_unmerge_uffd_wp(void) { struct uffdio_writeprotect uffd_writeprotect; - struct uffdio_register uffdio_register; const unsigned int size = 2 * MiB; struct uffdio_api uffdio_api; char *map; @@ -210,10 +209,7 @@ static void test_unmerge_uffd_wp(void) } /* Register UFFD-WP, no need for an actual handler. 
*/ - uffdio_register.range.start = (unsigned long) map; - uffdio_register.range.len = size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { + if (uffd_register(uffd, map, size, false, true, false)) { ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n"); goto close_uffd; } diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index c57757c2a36f..daa5b5781e7a 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -330,33 +330,6 @@ void uffd_test_ctx_init(uint64_t features) err("pipe"); } -uint64_t get_expected_ioctls(uint64_t mode) -{ - uint64_t ioctls = UFFD_API_RANGE_IOCTLS; - - if (test_type == TEST_HUGETLB) - ioctls &= ~(1 << _UFFDIO_ZEROPAGE); - - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); - - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) - ioctls &= ~(1 << _UFFDIO_CONTINUE); - - return ioctls; -} - -void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) -{ - uint64_t expected = get_expected_ioctls(mode); - uint64_t actual = ioctls & expected; - - if (actual != expected) { - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, - expected, actual); - } -} - void wp_range(int ufd, __u64 start, __u64 len, bool wp) { struct uffdio_writeprotect prms; diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index d9430cfdcb19..11f770391bd9 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -101,8 +101,6 @@ extern uffd_test_ops_t *uffd_test_ops; void uffd_stats_report(struct uffd_stats *stats, int n_cpus); void uffd_test_ctx_init(uint64_t features); void userfaultfd_open(uint64_t *features); -uint64_t get_expected_ioctls(uint64_t mode); -void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls); int uffd_read_msg(int ufd, struct uffd_msg *msg); void wp_range(int ufd, __u64 start, __u64 len, bool wp); void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats); diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index c68a9aeefc41..e6d39a755082 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -463,28 +463,19 @@ static int uffdio_zeropage(int ufd, unsigned long offset) /* exercise UFFDIO_ZEROPAGE */ static int userfaultfd_zeropage_test(void) { - struct uffdio_register uffdio_register; - printf("testing UFFDIO_ZEROPAGE: "); fflush(stdout); uffd_test_ctx_init(0); - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, test_uffdio_wp, false)) err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - if (area_dst_alias) { /* Needed this to test zeropage-retry on shared memory */ - uffdio_register.range.start = (unsigned long) area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + if (uffd_register(uffd, area_dst_alias, nr_pages * page_size, + true, test_uffdio_wp, false)) err("register failure"); } @@ -498,7 +489,6 @@ static int userfaultfd_zeropage_test(void) static 
int userfaultfd_events_test(void) { - struct uffdio_register uffdio_register; pthread_t uffd_mon; int err, features; pid_t pid; @@ -514,17 +504,10 @@ static int userfaultfd_events_test(void) fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, test_uffdio_wp, false)) err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) err("uffd_poll_thread create"); @@ -550,7 +533,6 @@ static int userfaultfd_events_test(void) static int userfaultfd_sig_test(void) { - struct uffdio_register uffdio_register; unsigned long userfaults; pthread_t uffd_mon; int err, features; @@ -566,17 +548,10 @@ static int userfaultfd_sig_test(void) fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, test_uffdio_wp, false)) err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - if (faulting_process(1)) err("faulting process failed"); @@ -629,7 +604,6 @@ void check_memory_contents(char *p) static int userfaultfd_minor_test(void) { unsigned long p; - struct uffdio_register uffdio_register; pthread_t uffd_mon; char c; struct uffd_stats stats = { 0 }; @@ -642,17 +616,10 @@ static int userfaultfd_minor_test(void) uffd_test_ctx_init(uffd_minor_feature()); - uffdio_register.range.start = (unsigned long)area_dst_alias; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + if (uffd_register(uffd, area_dst_alias, nr_pages * page_size, + false, test_uffdio_wp, true)) err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - /* * After registering with UFFD, populate the non-UFFD-registered side of * the shared mapping. This should *not* trigger any UFFD minor faults. 
@@ -777,7 +744,6 @@ static void userfaultfd_wp_unpopulated_test(int pagemap_fd) static void userfaultfd_pagemap_test(unsigned int test_pgsize) { - struct uffdio_register uffdio_register; int pagemap_fd; uint64_t value; @@ -805,10 +771,8 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) err("madvise(MADV_NOHUGEPAGE) failed"); } - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + if (uffd_register(uffd, area_dst, nr_pages * page_size, + false, true, false)) err("register failed"); pagemap_fd = pagemap_open(); @@ -858,8 +822,8 @@ static int userfaultfd_stress(void) { void *area; unsigned long nr; - struct uffdio_register uffdio_register; struct uffd_stats uffd_stats[nr_cpus]; + uint64_t mem_size = nr_pages * page_size; uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); @@ -894,20 +858,13 @@ static int userfaultfd_stress(void) fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); /* register */ - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + if (uffd_register(uffd, area_dst, mem_size, + true, test_uffdio_wp, false)) err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) - area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + if (uffd_register(uffd, area_dst_alias, mem_size, + true, test_uffdio_wp, false)) err("register failure alias"); } @@ -949,12 +906,10 @@ static int userfaultfd_stress(void) nr_pages * page_size, false); /* unregister */ - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) + if (uffd_unregister(uffd, area_dst, mem_size)) err("unregister failure"); if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) area_dst; - if (ioctl(uffd, UFFDIO_UNREGISTER, - &uffdio_register.range)) + if (uffd_unregister(uffd, area_dst_alias, mem_size)) err("unregister failure alias"); } diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 54d227d6f70a..bb633d050d71 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include +#include #include "../kselftest.h" #include "vm_util.h" @@ -193,3 +195,38 @@ unsigned long default_huge_page_size(void) fclose(f); return hps; } + +int uffd_register(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor) +{ + struct uffdio_register uffdio_register = { 0 }; + uint64_t mode = 0; + int ret = 0; + + if (miss) + mode |= UFFDIO_REGISTER_MODE_MISSING; + if (wp) + mode |= UFFDIO_REGISTER_MODE_WP; + if (minor) + mode |= UFFDIO_REGISTER_MODE_MINOR; + + uffdio_register.range.start = (unsigned long)addr; + uffdio_register.range.len = len; + uffdio_register.mode = mode; + + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) + ret = -errno; + + return ret; +} + +int uffd_unregister(int uffd, void *addr, uint64_t len) +{ + struct uffdio_range range = { .start = (uintptr_t)addr, .len = len }; + int ret = 0; + + if (ioctl(uffd, UFFDIO_UNREGISTER, &range) == -1) + ret = -errno; + + return ret; +} diff --git 
a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index d9fadddb5c69..3a9762022efd 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -45,6 +45,10 @@ bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); int64_t allocate_transhuge(void *ptr, int pagemap_fd); unsigned long default_huge_page_size(void); +int uffd_register(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor); +int uffd_unregister(int uffd, void *addr, uint64_t len); + /* * On ppc64 this will only work with radix 2M hugepage size */ -- cgit v1.2.3 From 78391f6460ee2cb43f64bbdd7a7cf840e5a118a6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:54 -0400 Subject: selftests/mm: uffd_open_{dev|sys}() Provide two helpers to open an uffd handle. Drop the error checks around SKIPs because it's inside an errexit() anyway, which IMHO doesn't really help much if the test will not continue. Link: https://lkml.kernel.org/r/20230412164254.328335-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Axel Rasmussen Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 28 +++++----------------------- tools/testing/selftests/mm/vm_util.c | 24 ++++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.h | 2 ++ 3 files changed, 31 insertions(+), 23 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index daa5b5781e7a..09ea24c5f02c 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -192,34 +192,16 @@ void uffd_stats_report(struct uffd_stats *stats, int n_cpus) printf("\n"); } -static int __userfaultfd_open_dev(void) -{ - int fd, _uffd; - - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); - if (fd < 0) - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); - - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); - if (_uffd < 0) - errexit(errno == ENOTTY ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - close(fd); - return _uffd; -} - void userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; if (test_dev_userfaultfd) - uffd = __userfaultfd_open_dev(); - else { - uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); - if (uffd < 0) - errexit(errno == ENOSYS ? 
KSFT_SKIP : 1, - "creating userfaultfd failed"); - } + uffd = uffd_open_dev(UFFD_FLAGS); + else + uffd = uffd_open_sys(UFFD_FLAGS); + if (uffd < 0) + err("uffd open failed (dev=%d)", test_dev_userfaultfd); uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index bb633d050d71..5ee6c4688a7c 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include "../kselftest.h" #include "vm_util.h" @@ -230,3 +232,25 @@ int uffd_unregister(int uffd, void *addr, uint64_t len) return ret; } + +int uffd_open_dev(unsigned int flags) +{ + int fd, uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + return fd; + uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags); + close(fd); + + return uffd; +} + +int uffd_open_sys(unsigned int flags) +{ +#ifdef __NR_userfaultfd + return syscall(__NR_userfaultfd, flags); +#else + return -1; +#endif +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 3a9762022efd..481354141533 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -48,6 +48,8 @@ unsigned long default_huge_page_size(void); int uffd_register(int uffd, void *addr, uint64_t len, bool miss, bool wp, bool minor); int uffd_unregister(int uffd, void *addr, uint64_t len); +int uffd_open_dev(unsigned int flags); +int uffd_open_sys(unsigned int flags); /* * On ppc64 this will only work with radix 2M hugepage size -- cgit v1.2.3 From d5433ce84d2572d43a3c58e5cab21db200669bfd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:42:57 -0400 Subject: selftests/mm: UFFDIO_API test Add one simple test for UFFDIO_API. With that, I also added a bunch of small but handy helpers along the way. Link: https://lkml.kernel.org/r/20230412164257.328375-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 109 ++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 6857388783be..bb492c258486 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -9,9 +9,116 @@ #ifdef __NR_userfaultfd +static void uffd_test_report(void) +{ + printf("Userfaults unit tests: pass=%u, skip=%u, fail=%u (total=%u)\n", + ksft_get_pass_cnt(), + ksft_get_xskip_cnt(), + ksft_get_fail_cnt(), + ksft_test_num()); +} + +static void uffd_test_pass(void) +{ + printf("done\n"); + ksft_inc_pass_cnt(); +} + +#define uffd_test_start(...) do { \ + printf("Testing "); \ + printf(__VA_ARGS__); \ + printf("... "); \ + fflush(stdout); \ + } while (0) + +#define uffd_test_fail(...) do { \ + printf("failed [reason: "); \ + printf(__VA_ARGS__); \ + printf("]\n"); \ + ksft_inc_fail_cnt(); \ + } while (0) + +#define uffd_test_skip(...) do { \ + printf("skipped [reason: "); \ + printf(__VA_ARGS__); \ + printf("]\n"); \ + ksft_inc_xskip_cnt(); \ + } while (0) + +/* + * Returns 1 if specific userfaultfd supported, 0 otherwise. 
Note, we'll + * return 1 even if some test failed as long as uffd supported, because in + * that case we still want to proceed with the rest uffd unit tests. + */ +static int test_uffd_api(bool use_dev) +{ + struct uffdio_api uffdio_api; + int uffd; + + uffd_test_start("UFFDIO_API (with %s)", + use_dev ? "/dev/userfaultfd" : "syscall"); + + if (use_dev) + uffd = uffd_open_dev(UFFD_FLAGS); + else + uffd = uffd_open_sys(UFFD_FLAGS); + if (uffd < 0) { + uffd_test_skip("cannot open userfaultfd handle"); + return 0; + } + + /* Test wrong UFFD_API */ + uffdio_api.api = 0xab; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) { + uffd_test_fail("UFFDIO_API should fail with wrong api but didn't"); + goto out; + } + + /* Test wrong feature bit */ + uffdio_api.api = UFFD_API; + uffdio_api.features = BIT_ULL(63); + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) { + uffd_test_fail("UFFDIO_API should fail with wrong feature but didn't"); + goto out; + } + + /* Test normal UFFDIO_API */ + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { + uffd_test_fail("UFFDIO_API should succeed but failed"); + goto out; + } + + /* Test double requests of UFFDIO_API with a random feature set */ + uffdio_api.features = BIT_ULL(0); + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) { + uffd_test_fail("UFFDIO_API should reject initialized uffd"); + goto out; + } + + uffd_test_pass(); +out: + close(uffd); + /* We have a valid uffd handle */ + return 1; +} + int main(int argc, char *argv[]) { - return KSFT_PASS; + int has_uffd; + + has_uffd = test_uffd_api(false); + has_uffd |= test_uffd_api(true); + + if (!has_uffd) { + printf("Userfaultfd not supported or unprivileged, skip all tests\n"); + exit(KSFT_SKIP); + } + uffd_test_report(); + + return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS; } #else /* __NR_userfaultfd */ -- cgit v1.2.3 From c5cb903646f4df0ab3760dcb2e5571b528b6db59 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:43:31 -0400 Subject: selftests/mm: drop global mem_fd in uffd tests Drop it by creating the memfd dynamically in the tests. 
Link: https://lkml.kernel.org/r/20230412164331.328584-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Rapoport (IBM) Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 28 +++++++++++++++++++++++++++- tools/testing/selftests/mm/uffd-common.h | 2 +- tools/testing/selftests/mm/uffd-stress.c | 15 --------------- 3 files changed, 28 insertions(+), 17 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 09ea24c5f02c..b1617f5d4517 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -12,12 +12,32 @@ volatile bool test_uffdio_copy_eexist = true; unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; -int mem_fd, uffd = -1, uffd_flags, finished, *pipefd, test_type; +int uffd = -1, uffd_flags, finished, *pipefd, test_type; bool map_shared, test_collapse, test_dev_userfaultfd; bool test_uffdio_wp = true, test_uffdio_minor = false; unsigned long long *count_verify; uffd_test_ops_t *uffd_test_ops; +static int uffd_mem_fd_create(off_t mem_size, bool hugetlb) +{ + unsigned int memfd_flags = 0; + int mem_fd; + + if (hugetlb) + memfd_flags = MFD_HUGETLB; + mem_fd = memfd_create("uffd-test", memfd_flags); + if (mem_fd < 0) + err("memfd_create"); + if (ftruncate(mem_fd, mem_size)) + err("ftruncate"); + if (fallocate(mem_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + mem_size)) + err("fallocate"); + + return mem_fd; +} + static void anon_release_pages(char *rel_area) { if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) @@ -51,6 +71,7 @@ static void hugetlb_allocate_area(void **alloc_area, bool is_src) off_t offset = is_src ? 0 : size; void *area_alias = NULL; char **alloc_area_alias; + int mem_fd = uffd_mem_fd_create(size * 2, true); *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, (map_shared ? MAP_SHARED : MAP_PRIVATE) | @@ -73,6 +94,8 @@ static void hugetlb_allocate_area(void **alloc_area, bool is_src) } if (area_alias) *alloc_area_alias = area_alias; + + close(mem_fd); } static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -95,6 +118,7 @@ static void shmem_allocate_area(void **alloc_area, bool is_src) size_t bytes = nr_pages * page_size; unsigned long offset = is_src ? 
0 : bytes; char *p = NULL, *p_alias = NULL; + int mem_fd = uffd_mem_fd_create(bytes * 2, false); if (test_collapse) { p = BASE_PMD_ADDR; @@ -124,6 +148,8 @@ static void shmem_allocate_area(void **alloc_area, bool is_src) area_src_alias = area_alias; else area_dst_alias = area_alias; + + close(mem_fd); } static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 11f770391bd9..0dfab7057295 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -87,7 +87,7 @@ typedef struct uffd_test_ops uffd_test_ops_t; extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; -extern int mem_fd, uffd, uffd_flags, finished, *pipefd, test_type; +extern int uffd, uffd_flags, finished, *pipefd, test_type; extern bool map_shared, test_collapse, test_dev_userfaultfd; extern bool test_uffdio_wp, test_uffdio_minor; extern unsigned long long *count_verify; diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index e6d39a755082..4eca1a0276c2 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -1090,21 +1090,6 @@ int main(int argc, char **argv) } nr_pages = nr_pages_per_cpu * nr_cpus; - if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { - unsigned int memfd_flags = 0; - - if (test_type == TEST_HUGETLB) - memfd_flags = MFD_HUGETLB; - mem_fd = memfd_create(argv[0], memfd_flags); - if (mem_fd < 0) - err("memfd_create"); - if (ftruncate(mem_fd, nr_pages * page_size * 2)) - err("ftruncate"); - if (fallocate(mem_fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, - nr_pages * page_size * 2)) - err("fallocate"); - } printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", nr_pages, nr_pages_per_cpu); return userfaultfd_stress(); -- cgit v1.2.3 From 265818ef988b3a4cd12fc8885966413b78f06d96 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:43:33 -0400 Subject: selftests/mm: drop global hpage_size in uffd tests hpage_size was wrongly used. Sometimes it means hugetlb default size, sometimes it was used as thp size. Remove the global variable and use the right one at each place. 
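The two sizes come from different kernel interfaces, which is why a single shared global was misleading; a minimal sketch of reading each one, assuming the usual sysfs/procfs locations that read_pmd_pagesize() and default_huge_page_size() rely on (illustration only, not the selftest helpers):

#include <stdio.h>

static unsigned long thp_pmd_size(void)
{
	unsigned long sz = 0;
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");

	if (f) {
		if (fscanf(f, "%lu", &sz) != 1)
			sz = 0;
		fclose(f);
	}
	return sz;
}

static unsigned long default_hugetlb_size(void)
{
	unsigned long kb = 0;
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (f) {
		while (fgets(line, sizeof(line), f))
			if (sscanf(line, "Hugepagesize: %lu kB", &kb) == 1)
				break;
		fclose(f);
	}
	return kb * 1024;
}

int main(void)
{
	printf("THP PMD size:         %lu bytes\n", thp_pmd_size());
	printf("default hugetlb size: %lu bytes\n", default_hugetlb_size());
	return 0;
}

On x86-64 both usually report 2MB, which is exactly how a mixed-up global could go unnoticed.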
Link: https://lkml.kernel.org/r/20230412164333.328596-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Rapoport (IBM) Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 7 ++++--- tools/testing/selftests/mm/uffd-common.h | 2 +- tools/testing/selftests/mm/uffd-stress.c | 6 +++--- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index b1617f5d4517..f02dfcf10714 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -10,7 +10,7 @@ #define BASE_PMD_ADDR ((void *)(1UL << 30)) volatile bool test_uffdio_copy_eexist = true; -unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; +unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; int uffd = -1, uffd_flags, finished, *pipefd, test_type; bool map_shared, test_collapse, test_dev_userfaultfd; @@ -115,7 +115,7 @@ static void shmem_release_pages(char *rel_area) static void shmem_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; - size_t bytes = nr_pages * page_size; + size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize(); unsigned long offset = is_src ? 0 : bytes; char *p = NULL, *p_alias = NULL; int mem_fd = uffd_mem_fd_create(bytes * 2, false); @@ -159,7 +159,8 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) { - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, + read_pmd_pagesize())) err("Did not find expected %d number of hugepages", expect_nr_hpages); } diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 0dfab7057295..47565b2f2dee 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -85,7 +85,7 @@ struct uffd_test_ops { }; typedef struct uffd_test_ops uffd_test_ops_t; -extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; +extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; extern int uffd, uffd_flags, finished, *pipefd, test_type; extern bool map_shared, test_collapse, test_dev_userfaultfd; diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 4eca1a0276c2..54fc9b4ffa3c 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -655,7 +655,7 @@ static int userfaultfd_minor_test(void) uffd_test_ops->check_pmd_mapping(area_dst, nr_pages * page_size / - hpage_size); + read_pmd_pagesize()); /* * This won't cause uffd-fault - it purely just makes sure there * was no corruption. 
@@ -997,7 +997,7 @@ static void parse_test_type_arg(const char *raw_type) err("Unsupported test: %s", raw_type); if (test_type == TEST_HUGETLB) - page_size = hpage_size; + page_size = default_huge_page_size(); else page_size = sysconf(_SC_PAGE_SIZE); @@ -1035,6 +1035,7 @@ static void sigalrm(int sig) int main(int argc, char **argv) { size_t bytes; + size_t hpage_size = read_pmd_pagesize(); if (argc < 4) usage(); @@ -1043,7 +1044,6 @@ int main(int argc, char **argv) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); - hpage_size = default_huge_page_size(); parse_test_type_arg(argv[1]); bytes = atol(argv[2]) * 1024 * 1024; -- cgit v1.2.3 From 508340845dd1423b6c81fccf082039dff202fb9f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:43:37 -0400 Subject: selftests/mm: rename uffd_stats to uffd_args Prepare for adding more fields into the struct. Link: https://lkml.kernel.org/r/20230412164337.328607-1-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Mike Rapoport (IBM) Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 28 +++++++++--------- tools/testing/selftests/mm/uffd-common.h | 6 ++-- tools/testing/selftests/mm/uffd-stress.c | 51 ++++++++++++++++---------------- 3 files changed, 42 insertions(+), 43 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index f02dfcf10714..e746405aa8f3 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -186,34 +186,34 @@ struct uffd_test_ops hugetlb_uffd_test_ops = { .check_pmd_mapping = NULL, }; -void uffd_stats_report(struct uffd_stats *stats, int n_cpus) +void uffd_stats_report(struct uffd_args *args, int n_cpus) { int i; unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; for (i = 0; i < n_cpus; i++) { - miss_total += stats[i].missing_faults; - wp_total += stats[i].wp_faults; - minor_total += stats[i].minor_faults; + miss_total += args[i].missing_faults; + wp_total += args[i].wp_faults; + minor_total += args[i].minor_faults; } printf("userfaults: "); if (miss_total) { printf("%llu missing (", miss_total); for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].missing_faults); + printf("%lu+", args[i].missing_faults); printf("\b) "); } if (wp_total) { printf("%llu wp (", wp_total); for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].wp_faults); + printf("%lu+", args[i].wp_faults); printf("\b) "); } if (minor_total) { printf("%llu minor (", minor_total); for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].minor_faults); + printf("%lu+", args[i].minor_faults); printf("\b)"); } printf("\n"); @@ -397,7 +397,7 @@ int uffd_read_msg(int ufd, struct uffd_msg *msg) return 0; } -void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats) +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) { unsigned long offset; @@ -407,7 +407,7 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats) if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { /* Write protect page faults */ wp_range(uffd, msg->arg.pagefault.address, page_size, false); - stats->wp_faults++; + args->wp_faults++; } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { uint8_t *area; int b; @@ -430,7 +430,7 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats) for (b = 0; b < 
page_size; ++b) area[b] = ~area[b]; continue_range(uffd, msg->arg.pagefault.address, page_size); - stats->minor_faults++; + args->minor_faults++; } else { /* * Missing page faults. @@ -460,14 +460,14 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats) offset &= ~(page_size-1); if (copy_page(uffd, offset)) - stats->missing_faults++; + args->missing_faults++; } } void *uffd_poll_thread(void *arg) { - struct uffd_stats *stats = (struct uffd_stats *)arg; - unsigned long cpu = stats->cpu; + struct uffd_args *args = (struct uffd_args *)arg; + unsigned long cpu = args->cpu; struct pollfd pollfd[2]; struct uffd_msg msg; struct uffdio_register uffd_reg; @@ -502,7 +502,7 @@ void *uffd_poll_thread(void *arg) err("unexpected msg event %u\n", msg.event); break; case UFFD_EVENT_PAGEFAULT: - uffd_handle_page_fault(&msg, stats); + uffd_handle_page_fault(&msg, args); break; case UFFD_EVENT_FORK: close(uffd); diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 47565b2f2dee..f8d2ad178827 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -70,7 +70,7 @@ - 1))) /* Userfaultfd test statistics */ -struct uffd_stats { +struct uffd_args { int cpu; unsigned long missing_faults; unsigned long wp_faults; @@ -98,12 +98,12 @@ extern uffd_test_ops_t shmem_uffd_test_ops; extern uffd_test_ops_t hugetlb_uffd_test_ops; extern uffd_test_ops_t *uffd_test_ops; -void uffd_stats_report(struct uffd_stats *stats, int n_cpus); +void uffd_stats_report(struct uffd_args *args, int n_cpus); void uffd_test_ctx_init(uint64_t features); void userfaultfd_open(uint64_t *features); int uffd_read_msg(int ufd, struct uffd_msg *msg); void wp_range(int ufd, __u64 start, __u64 len, bool wp); -void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats); +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args); int __copy_page(int ufd, unsigned long offset, bool retry); int copy_page(int ufd, unsigned long offset); void *uffd_poll_thread(void *arg); diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 54fc9b4ffa3c..ce7251ab93ef 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -90,16 +90,15 @@ static void usage(void) exit(1); } -static void uffd_stats_reset(struct uffd_stats *uffd_stats, - unsigned long n_cpus) +static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus) { int i; for (i = 0; i < n_cpus; i++) { - uffd_stats[i].cpu = i; - uffd_stats[i].missing_faults = 0; - uffd_stats[i].wp_faults = 0; - uffd_stats[i].minor_faults = 0; + args[i].cpu = i; + args[i].missing_faults = 0; + args[i].wp_faults = 0; + args[i].minor_faults = 0; } } @@ -163,7 +162,7 @@ pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; static void *uffd_read_thread(void *arg) { - struct uffd_stats *stats = (struct uffd_stats *)arg; + struct uffd_args *args = (struct uffd_args *)arg; struct uffd_msg msg; pthread_mutex_unlock(&uffd_read_mutex); @@ -172,7 +171,7 @@ static void *uffd_read_thread(void *arg) for (;;) { if (uffd_read_msg(uffd, &msg)) continue; - uffd_handle_page_fault(&msg, stats); + uffd_handle_page_fault(&msg, args); } return NULL; @@ -210,7 +209,7 @@ static void *background_thread(void *arg) return NULL; } -static int stress(struct uffd_stats *uffd_stats) +static int stress(struct uffd_args *args) { unsigned long cpu; pthread_t locking_threads[nr_cpus]; @@ -225,12 +224,12 @@ 
static int stress(struct uffd_stats *uffd_stats) if (bounces & BOUNCE_POLL) { if (pthread_create(&uffd_threads[cpu], &attr, uffd_poll_thread, - (void *)&uffd_stats[cpu])) + (void *)&args[cpu])) return 1; } else { if (pthread_create(&uffd_threads[cpu], &attr, uffd_read_thread, - (void *)&uffd_stats[cpu])) + (void *)&args[cpu])) return 1; pthread_mutex_lock(&uffd_read_mutex); } @@ -264,7 +263,7 @@ static int stress(struct uffd_stats *uffd_stats) if (write(pipefd[cpu*2+1], &c, 1) != 1) err("pipefd write error"); if (pthread_join(uffd_threads[cpu], - (void *)&uffd_stats[cpu])) + (void *)&args[cpu])) return 1; } else { if (pthread_cancel(uffd_threads[cpu])) @@ -493,7 +492,7 @@ static int userfaultfd_events_test(void) int err, features; pid_t pid; char c; - struct uffd_stats stats = { 0 }; + struct uffd_args args = { 0 }; printf("testing events (fork, remap, remove): "); fflush(stdout); @@ -508,7 +507,7 @@ static int userfaultfd_events_test(void) true, test_uffdio_wp, false)) err("register failure"); - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) err("uffd_poll_thread create"); pid = fork(); @@ -526,9 +525,9 @@ static int userfaultfd_events_test(void) if (pthread_join(uffd_mon, NULL)) return 1; - uffd_stats_report(&stats, 1); + uffd_stats_report(&args, 1); - return stats.missing_faults != nr_pages; + return args.missing_faults != nr_pages; } static int userfaultfd_sig_test(void) @@ -538,7 +537,7 @@ static int userfaultfd_sig_test(void) int err, features; pid_t pid; char c; - struct uffd_stats stats = { 0 }; + struct uffd_args args = { 0 }; printf("testing signal delivery: "); fflush(stdout); @@ -557,7 +556,7 @@ static int userfaultfd_sig_test(void) uffd_test_ops->release_pages(area_dst); - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) err("uffd_poll_thread create"); pid = fork(); @@ -606,7 +605,7 @@ static int userfaultfd_minor_test(void) unsigned long p; pthread_t uffd_mon; char c; - struct uffd_stats stats = { 0 }; + struct uffd_args args = { 0 }; if (!test_uffdio_minor) return 0; @@ -629,7 +628,7 @@ static int userfaultfd_minor_test(void) page_size); } - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) err("uffd_poll_thread create"); /* @@ -645,7 +644,7 @@ static int userfaultfd_minor_test(void) if (pthread_join(uffd_mon, NULL)) return 1; - uffd_stats_report(&stats, 1); + uffd_stats_report(&args, 1); if (test_collapse) { printf("testing collapse of uffd memory into PMD-mapped THPs:"); @@ -664,7 +663,7 @@ static int userfaultfd_minor_test(void) printf(" done.\n"); } - return stats.missing_faults != 0 || stats.minor_faults != nr_pages; + return args.missing_faults != 0 || args.minor_faults != nr_pages; } static int pagemap_open(void) @@ -822,7 +821,7 @@ static int userfaultfd_stress(void) { void *area; unsigned long nr; - struct uffd_stats uffd_stats[nr_cpus]; + struct uffd_args args[nr_cpus]; uint64_t mem_size = nr_pages * page_size; uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); @@ -894,10 +893,10 @@ static int userfaultfd_stress(void) */ uffd_test_ops->release_pages(area_dst); - uffd_stats_reset(uffd_stats, nr_cpus); + uffd_stats_reset(args, nr_cpus); /* bounce pass */ - if (stress(uffd_stats)) + if (stress(args)) return 1; /* Clear all the write protections if there is any */ @@ -926,7 +925,7 @@ static int userfaultfd_stress(void) 
swap(area_src_alias, area_dst_alias); - uffd_stats_report(uffd_stats, nr_cpus); + uffd_stats_report(args, nr_cpus); } if (test_type == TEST_ANON) { -- cgit v1.2.3 From 0210c43ef623a3c35cafd7dbe25b1b3c1699e7e9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:43:41 -0400 Subject: selftests/mm: let uffd_handle_page_fault() take wp parameter Make the handler optionally apply WP bit when resolving page faults for either missing or minor page faults. This moves towards removing global test_uffdio_wp outside of the common code. Link: https://lkml.kernel.org/r/20230412164341.328618-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 17 +++++++++-------- tools/testing/selftests/mm/uffd-common.h | 6 ++++-- tools/testing/selftests/mm/uffd-stress.c | 16 ++++++++++------ 3 files changed, 23 insertions(+), 16 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index e746405aa8f3..daa2a95408e6 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -353,7 +353,7 @@ void wp_range(int ufd, __u64 start, __u64 len, bool wp) err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); } -static void continue_range(int ufd, __u64 start, __u64 len) +static void continue_range(int ufd, __u64 start, __u64 len, bool wp) { struct uffdio_continue req; int ret; @@ -361,7 +361,7 @@ static void continue_range(int ufd, __u64 start, __u64 len) req.range.start = start; req.range.len = len; req.mode = 0; - if (test_uffdio_wp) + if (wp) req.mode |= UFFDIO_CONTINUE_MODE_WP; if (ioctl(ufd, UFFDIO_CONTINUE, &req)) @@ -429,7 +429,8 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) area_dst_alias)); for (b = 0; b < page_size; ++b) area[b] = ~area[b]; - continue_range(uffd, msg->arg.pagefault.address, page_size); + continue_range(uffd, msg->arg.pagefault.address, page_size, + args->apply_wp); args->minor_faults++; } else { /* @@ -459,7 +460,7 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; offset &= ~(page_size-1); - if (copy_page(uffd, offset)) + if (copy_page(uffd, offset, args->apply_wp)) args->missing_faults++; } } @@ -555,7 +556,7 @@ static void wake_range(int ufd, unsigned long addr, unsigned long len) addr), exit(1); } -int __copy_page(int ufd, unsigned long offset, bool retry) +int __copy_page(int ufd, unsigned long offset, bool retry, bool wp) { struct uffdio_copy uffdio_copy; @@ -564,7 +565,7 @@ int __copy_page(int ufd, unsigned long offset, bool retry) uffdio_copy.dst = (unsigned long) area_dst + offset; uffdio_copy.src = (unsigned long) area_src + offset; uffdio_copy.len = page_size; - if (test_uffdio_wp) + if (wp) uffdio_copy.mode = UFFDIO_COPY_MODE_WP; else uffdio_copy.mode = 0; @@ -587,7 +588,7 @@ int __copy_page(int ufd, unsigned long offset, bool retry) return 0; } -int copy_page(int ufd, unsigned long offset) +int copy_page(int ufd, unsigned long offset, bool wp) { - return __copy_page(ufd, offset, false); + return __copy_page(ufd, offset, false, wp); } diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index f8d2ad178827..0ec07d025cfe 100644 --- a/tools/testing/selftests/mm/uffd-common.h 
+++ b/tools/testing/selftests/mm/uffd-common.h @@ -72,6 +72,8 @@ /* Userfaultfd test statistics */ struct uffd_args { int cpu; + /* Whether apply wr-protects when installing pages */ + bool apply_wp; unsigned long missing_faults; unsigned long wp_faults; unsigned long minor_faults; @@ -104,8 +106,8 @@ void userfaultfd_open(uint64_t *features); int uffd_read_msg(int ufd, struct uffd_msg *msg); void wp_range(int ufd, __u64 start, __u64 len, bool wp); void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args); -int __copy_page(int ufd, unsigned long offset, bool retry); -int copy_page(int ufd, unsigned long offset); +int __copy_page(int ufd, unsigned long offset, bool retry, bool wp); +int copy_page(int ufd, unsigned long offset, bool wp); void *uffd_poll_thread(void *arg); #define TEST_ANON 1 diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index ce7251ab93ef..747d588c0d69 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -96,6 +96,7 @@ static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus) for (i = 0; i < n_cpus; i++) { args[i].cpu = i; + args[i].apply_wp = test_uffdio_wp; args[i].missing_faults = 0; args[i].wp_faults = 0; args[i].minor_faults = 0; @@ -155,7 +156,7 @@ static void *locking_thread(void *arg) static int copy_page_retry(int ufd, unsigned long offset) { - return __copy_page(ufd, offset, true); + return __copy_page(ufd, offset, true, test_uffdio_wp); } pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -308,7 +309,7 @@ static void sighndl(int sig, siginfo_t *siginfo, void *ptr) * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal * feature. Using monitor thread, verify no userfault events are generated. 
*/ -static int faulting_process(int signal_test) +static int faulting_process(int signal_test, bool wp) { unsigned long nr; unsigned long long count; @@ -343,7 +344,7 @@ static int faulting_process(int signal_test) if (steps == 1) { /* This is a MISSING request */ steps++; - if (copy_page(uffd, offset)) + if (copy_page(uffd, offset, wp)) signalled++; } else { /* This is a WP request */ @@ -507,6 +508,7 @@ static int userfaultfd_events_test(void) true, test_uffdio_wp, false)) err("register failure"); + args.apply_wp = test_uffdio_wp; if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) err("uffd_poll_thread create"); @@ -515,7 +517,7 @@ static int userfaultfd_events_test(void) err("fork"); if (!pid) - exit(faulting_process(0)); + exit(faulting_process(0, test_uffdio_wp)); waitpid(pid, &err, 0); if (err) @@ -551,11 +553,12 @@ static int userfaultfd_sig_test(void) true, test_uffdio_wp, false)) err("register failure"); - if (faulting_process(1)) + if (faulting_process(1, test_uffdio_wp)) err("faulting process failed"); uffd_test_ops->release_pages(area_dst); + args.apply_wp = test_uffdio_wp; if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) err("uffd_poll_thread create"); @@ -564,7 +567,7 @@ static int userfaultfd_sig_test(void) err("fork"); if (!pid) - exit(faulting_process(2)); + exit(faulting_process(2, test_uffdio_wp)); waitpid(pid, &err, 0); if (err) @@ -628,6 +631,7 @@ static int userfaultfd_minor_test(void) page_size); } + args.apply_wp = test_uffdio_wp; if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) err("uffd_poll_thread create"); -- cgit v1.2.3 From be39fec4f97f01329ef48a5a3836f592f6a82766 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:43:45 -0400 Subject: selftests/mm: allow allocate_area() to fail properly Mostly to detect hugetlb allocation errors and skip hugetlb tests when pages are not allocated. Link: https://lkml.kernel.org/r/20230412164345.328659-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 47 +++++++++++++++++++++++--------- tools/testing/selftests/mm/uffd-common.h | 4 +-- 2 files changed, 36 insertions(+), 15 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index daa2a95408e6..bc6c5c38d6dd 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -44,10 +44,15 @@ static void anon_release_pages(char *rel_area) err("madvise(MADV_DONTNEED) failed"); } -static void anon_allocate_area(void **alloc_area, bool is_src) +static int anon_allocate_area(void **alloc_area, bool is_src) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (*alloc_area == MAP_FAILED) { + *alloc_area = NULL; + return -errno; + } + return 0; } static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -65,7 +70,7 @@ static void hugetlb_release_pages(char *rel_area) } } -static void hugetlb_allocate_area(void **alloc_area, bool is_src) +static int hugetlb_allocate_area(void **alloc_area, bool is_src) { off_t size = nr_pages * page_size; off_t offset = is_src ? 0 : size; @@ -77,14 +82,16 @@ static void hugetlb_allocate_area(void **alloc_area, bool is_src) (map_shared ? MAP_SHARED : MAP_PRIVATE) | (is_src ? 
0 : MAP_NORESERVE), mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of hugetlbfs file failed"); + if (*alloc_area == MAP_FAILED) { + *alloc_area = NULL; + return -errno; + } if (map_shared) { area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, offset); if (area_alias == MAP_FAILED) - err("mmap of hugetlb file alias failed"); + return -errno; } if (is_src) { @@ -96,6 +103,7 @@ static void hugetlb_allocate_area(void **alloc_area, bool is_src) *alloc_area_alias = area_alias; close(mem_fd); + return 0; } static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -112,7 +120,7 @@ static void shmem_release_pages(char *rel_area) err("madvise(MADV_REMOVE) failed"); } -static void shmem_allocate_area(void **alloc_area, bool is_src) +static int shmem_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize(); @@ -132,15 +140,20 @@ static void shmem_allocate_area(void **alloc_area, bool is_src) *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of memfd failed"); + if (*alloc_area == MAP_FAILED) { + *alloc_area = NULL; + return -errno; + } if (test_collapse && *alloc_area != p) err("mmap of memfd failed at %p", p); area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of memfd alias failed"); + if (area_alias == MAP_FAILED) { + munmap(*alloc_area, bytes); + *alloc_area = NULL; + return -errno; + } if (test_collapse && area_alias != p_alias) err("mmap of anonymous memory failed at %p", p_alias); @@ -150,6 +163,7 @@ static void shmem_allocate_area(void **alloc_area, bool is_src) area_dst_alias = area_alias; close(mem_fd); + return 0; } static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -282,14 +296,19 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_remap); } -void uffd_test_ctx_init(uint64_t features) +int uffd_test_ctx_init(uint64_t features) { unsigned long nr, cpu; + int ret; uffd_test_ctx_clear(); - uffd_test_ops->allocate_area((void **)&area_src, true); - uffd_test_ops->allocate_area((void **)&area_dst, false); + ret = uffd_test_ops->allocate_area((void **)&area_src, true); + if (ret) + return ret; + ret = uffd_test_ops->allocate_area((void **)&area_dst, false); + if (ret) + return ret; userfaultfd_open(&features); @@ -337,6 +356,8 @@ void uffd_test_ctx_init(uint64_t features) for (cpu = 0; cpu < nr_cpus; cpu++) if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) err("pipe"); + + return 0; } void wp_range(int ufd, __u64 start, __u64 len, bool wp) diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 0ec07d025cfe..9479a0649d7f 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -80,7 +80,7 @@ struct uffd_args { }; struct uffd_test_ops { - void (*allocate_area)(void **alloc_area, bool is_src); + int (*allocate_area)(void **alloc_area, bool is_src); void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); void (*check_pmd_mapping)(void *p, int expect_nr_hpages); @@ -101,7 +101,7 @@ extern uffd_test_ops_t hugetlb_uffd_test_ops; extern uffd_test_ops_t *uffd_test_ops; void uffd_stats_report(struct uffd_args *args, int n_cpus); -void uffd_test_ctx_init(uint64_t features); +int uffd_test_ctx_init(uint64_t 
features); void userfaultfd_open(uint64_t *features); int uffd_read_msg(int ufd, struct uffd_msg *msg); void wp_range(int ufd, __u64 start, __u64 len, bool wp); -- cgit v1.2.3 From 16a45b57cbf2312215fbac79df4b58a0d70e1f2b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:43:48 -0400 Subject: selftests/mm: add framework for uffd-unit-test Add a framework to be prepared to move unit tests from uffd-stress.c into uffd-unit-tests.c. The goal is to allow detection of uffd features for each test, and also loop over specified types of memory that a test support. Link: https://lkml.kernel.org/r/20230412164348.328710-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 124 +++++++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 37 ++++++++ tools/testing/selftests/mm/vm_util.h | 2 + 3 files changed, 163 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index bb492c258486..936b24a6f468 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -9,6 +9,66 @@ #ifdef __NR_userfaultfd +/* The unit test doesn't need a large or random size, make it 32MB for now */ +#define UFFD_TEST_MEM_SIZE (32UL << 20) + +#define MEM_ANON BIT_ULL(0) +#define MEM_SHMEM BIT_ULL(1) +#define MEM_SHMEM_PRIVATE BIT_ULL(2) +#define MEM_HUGETLB BIT_ULL(3) +#define MEM_HUGETLB_PRIVATE BIT_ULL(4) + +struct mem_type { + const char *name; + unsigned int mem_flag; + uffd_test_ops_t *mem_ops; + bool shared; +}; +typedef struct mem_type mem_type_t; + +mem_type_t mem_types[] = { + { + .name = "anon", + .mem_flag = MEM_ANON, + .mem_ops = &anon_uffd_test_ops, + .shared = false, + }, + { + .name = "shmem", + .mem_flag = MEM_SHMEM, + .mem_ops = &shmem_uffd_test_ops, + .shared = true, + }, + { + .name = "shmem-private", + .mem_flag = MEM_SHMEM_PRIVATE, + .mem_ops = &shmem_uffd_test_ops, + .shared = false, + }, + { + .name = "hugetlb", + .mem_flag = MEM_HUGETLB, + .mem_ops = &hugetlb_uffd_test_ops, + .shared = true, + }, + { + .name = "hugetlb-private", + .mem_flag = MEM_HUGETLB_PRIVATE, + .mem_ops = &hugetlb_uffd_test_ops, + .shared = false, + }, +}; + +/* Returns: UFFD_TEST_* */ +typedef void (*uffd_test_fn)(void); + +typedef struct { + const char *name; + uffd_test_fn uffd_fn; + unsigned int mem_targets; + uint64_t uffd_feature_required; +} uffd_test_case_t; + static void uffd_test_report(void) { printf("Userfaults unit tests: pass=%u, skip=%u, fail=%u (total=%u)\n", @@ -105,9 +165,50 @@ out: return 1; } +/* + * This function initializes the global variables. TODO: remove global + * vars and then remove this. + */ +static int uffd_setup_environment(uffd_test_case_t *test, mem_type_t *mem_type) +{ + map_shared = mem_type->shared; + uffd_test_ops = mem_type->mem_ops; + + if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) + page_size = default_huge_page_size(); + else + page_size = psize(); + + nr_pages = UFFD_TEST_MEM_SIZE / page_size; + /* TODO: remove this global var.. 
it's so ugly */ + nr_cpus = 1; + + return uffd_test_ctx_init(test->uffd_feature_required); +} + +static bool uffd_feature_supported(uffd_test_case_t *test) +{ + uint64_t features; + + if (uffd_get_features(&features)) + return false; + + return (features & test->uffd_feature_required) == + test->uffd_feature_required; +} + +uffd_test_case_t uffd_tests[] = { +}; + int main(int argc, char *argv[]) { + int n_tests = sizeof(uffd_tests) / sizeof(uffd_test_case_t); + int n_mems = sizeof(mem_types) / sizeof(mem_type_t); + uffd_test_case_t *test; + mem_type_t *mem_type; + char test_name[128]; int has_uffd; + int i, j; has_uffd = test_uffd_api(false); has_uffd |= test_uffd_api(true); @@ -116,6 +217,29 @@ int main(int argc, char *argv[]) printf("Userfaultfd not supported or unprivileged, skip all tests\n"); exit(KSFT_SKIP); } + + for (i = 0; i < n_tests; i++) { + test = &uffd_tests[i]; + for (j = 0; j < n_mems; j++) { + mem_type = &mem_types[j]; + if (!(test->mem_targets & mem_type->mem_flag)) + continue; + snprintf(test_name, sizeof(test_name), + "%s on %s", test->name, mem_type->name); + + uffd_test_start(test_name); + if (!uffd_feature_supported(test)) { + uffd_test_skip("feature missing"); + continue; + } + if (uffd_setup_environment(test, mem_type)) { + uffd_test_skip("environment setup failed"); + continue; + } + test->uffd_fn(); + } + } + uffd_test_report(); return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS; diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 5ee6c4688a7c..1bc0ceb01adb 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -254,3 +254,40 @@ int uffd_open_sys(unsigned int flags) return -1; #endif } + +int uffd_open(unsigned int flags) +{ + int uffd = uffd_open_sys(flags); + + if (uffd < 0) + uffd = uffd_open_dev(flags); + + return uffd; +} + +int uffd_get_features(uint64_t *features) +{ + struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 }; + /* + * This should by default work in most kernels; the feature list + * will be the same no matter what we pass in here. + */ + int fd = uffd_open(UFFD_USER_MODE_ONLY); + + if (fd < 0) + /* Maybe the kernel is older than user-only mode? */ + fd = uffd_open(0); + + if (fd < 0) + return fd; + + if (ioctl(fd, UFFDIO_API, &uffdio_api)) { + close(fd); + return -errno; + } + + *features = uffdio_api.features; + close(fd); + + return 0; +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 481354141533..634eb2f41145 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -50,6 +50,8 @@ int uffd_register(int uffd, void *addr, uint64_t len, int uffd_unregister(int uffd, void *addr, uint64_t len); int uffd_open_dev(unsigned int flags); int uffd_open_sys(unsigned int flags); +int uffd_open(unsigned int flags); +int uffd_get_features(uint64_t *features); /* * On ppc64 this will only work with radix 2M hugepage size -- cgit v1.2.3 From 8bda424fca0a0cc056c49a2500df03590648d029 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:43:52 -0400 Subject: selftests/mm: move uffd pagemap test to unit test Move it over and make it split into two tests, one for pagemap and one for the new WP_UNPOPULATED (to be a separate one). The thp pagemap test wasn't really working (with MADV_HUGEPAGE). Let's just drop it (since it never really worked anyway..) and leave that for later. 
Link: https://lkml.kernel.org/r/20230412164352.328733-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Rapoport (IBM) Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 166 --------------------------- tools/testing/selftests/mm/uffd-unit-tests.c | 145 +++++++++++++++++++++++ 2 files changed, 145 insertions(+), 166 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 747d588c0d69..61d025d87bf2 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -670,157 +670,6 @@ static int userfaultfd_minor_test(void) return args.missing_faults != 0 || args.minor_faults != nr_pages; } -static int pagemap_open(void) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - - if (fd < 0) - err("open pagemap"); - - return fd; -} - -/* This macro let __LINE__ works in err() */ -#define pagemap_check_wp(value, wp) do { \ - if (!!(value & PM_UFFD_WP) != wp) \ - err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ - } while (0) - -static int pagemap_test_fork(bool present) -{ - pid_t child = fork(); - uint64_t value; - int fd, result; - - if (!child) { - /* Open the pagemap fd of the child itself */ - fd = pagemap_open(); - value = pagemap_get_entry(fd, area_dst); - /* - * After fork() uffd-wp bit should be gone as long as we're - * without UFFD_FEATURE_EVENT_FORK - */ - pagemap_check_wp(value, false); - /* Succeed */ - exit(0); - } - waitpid(child, &result, 0); - return result; -} - -static void userfaultfd_wp_unpopulated_test(int pagemap_fd) -{ - uint64_t value; - - /* Test applying pte marker to anon unpopulated */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - - /* Test unprotect on anon pte marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Test zap on anon marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - if (madvise(area_dst, page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Test fault in after marker removed */ - *area_dst = 1; - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - /* Drop it to make pte none again */ - if (madvise(area_dst, page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - - /* Test read-zero-page upon pte marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - *(volatile char *)area_dst; - /* Drop it to make pte none again */ - if (madvise(area_dst, page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); -} - -static void userfaultfd_pagemap_test(unsigned int test_pgsize) -{ - int pagemap_fd; - uint64_t value; - - /* Pagemap tests uffd-wp only */ - if (!test_uffdio_wp) - return; - - /* Not enough memory to test this page size */ - if (test_pgsize > nr_pages * page_size) - return; - - printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); - /* Flush so it doesn't flush twice in parent/child later */ - fflush(stdout); - - uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); - - if (test_pgsize > page_size) { - /* This is a thp test */ - if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) - 
err("madvise(MADV_HUGEPAGE) failed"); - } else if (test_pgsize == page_size) { - /* This is normal page test; force no thp */ - if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) - err("madvise(MADV_NOHUGEPAGE) failed"); - } - - if (uffd_register(uffd, area_dst, nr_pages * page_size, - false, true, false)) - err("register failed"); - - pagemap_fd = pagemap_open(); - - /* Smoke test WP_UNPOPULATED first when it's still empty */ - if (test_pgsize == page_size) - userfaultfd_wp_unpopulated_test(pagemap_fd); - - /* Touch the page */ - *area_dst = 1; - wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(true)) - err("Detected stall uffd-wp bit in child"); - - /* Exclusive required or PAGEOUT won't work */ - if (!(value & PM_MMAP_EXCLUSIVE)) - err("multiple mapping detected: 0x%"PRIx64, value); - - if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) - err("madvise(MADV_PAGEOUT) failed"); - - /* Uffd-wp should persist even swapped out */ - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(false)) - err("Detected stall uffd-wp bit in child"); - - /* Unprotect; this tests swap pte modifications */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Fault in the page from disk */ - *area_dst = 2; - value = pagemap_get_entry(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - close(pagemap_fd); - printf("done\n"); -} - static int userfaultfd_stress(void) { void *area; @@ -932,21 +781,6 @@ static int userfaultfd_stress(void) uffd_stats_report(args, nr_cpus); } - if (test_type == TEST_ANON) { - /* - * shmem/hugetlb won't be able to run since they have different - * behavior on fork() (file-backed memory normally drops ptes - * directly when fork), meanwhile the pagemap test will verify - * pgtable entry of fork()ed child. 
- */ - userfaultfd_pagemap_test(page_size); - /* - * Hard-code for x86_64 for now for 2M THP, as x86_64 is - * currently the only one that supports uffd-wp - */ - userfaultfd_pagemap_test(page_size * 512); - } - return userfaultfd_zeropage_test() || userfaultfd_sig_test() || userfaultfd_events_test() || userfaultfd_minor_test(); } diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 936b24a6f468..4690c95a9420 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -197,7 +197,152 @@ static bool uffd_feature_supported(uffd_test_case_t *test) test->uffd_feature_required; } +static int pagemap_open(void) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + err("open pagemap"); + + return fd; +} + +/* This macro let __LINE__ works in err() */ +#define pagemap_check_wp(value, wp) do { \ + if (!!(value & PM_UFFD_WP) != wp) \ + err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ + } while (0) + +static int pagemap_test_fork(bool present) +{ + pid_t child = fork(); + uint64_t value; + int fd, result; + + if (!child) { + /* Open the pagemap fd of the child itself */ + fd = pagemap_open(); + value = pagemap_get_entry(fd, area_dst); + /* + * After fork() uffd-wp bit should be gone as long as we're + * without UFFD_FEATURE_EVENT_FORK + */ + pagemap_check_wp(value, false); + /* Succeed */ + exit(0); + } + waitpid(child, &result, 0); + return result; +} + +static void uffd_wp_unpopulated_test(void) +{ + uint64_t value; + int pagemap_fd; + + if (uffd_register(uffd, area_dst, nr_pages * page_size, + false, true, false)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Test applying pte marker to anon unpopulated */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + + /* Test unprotect on anon pte marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Test zap on anon marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Test fault in after marker removed */ + *area_dst = 1; + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + /* Drop it to make pte none again */ + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + + /* Test read-zero-page upon pte marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + *(volatile char *)area_dst; + /* Drop it to make pte none again */ + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + + uffd_test_pass(); +} + +static void uffd_pagemap_test(void) +{ + int pagemap_fd; + uint64_t value; + + if (uffd_register(uffd, area_dst, nr_pages * page_size, + false, true, false)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, page_size, true); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(true)) + err("Detected stall uffd-wp bit in child"); + + /* Exclusive required or PAGEOUT won't work */ + if (!(value & PM_MMAP_EXCLUSIVE)) + 
err("multiple mapping detected: 0x%"PRIx64, value); + + if (madvise(area_dst, page_size, MADV_PAGEOUT)) + err("madvise(MADV_PAGEOUT) failed"); + + /* Uffd-wp should persist even swapped out */ + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(false)) + err("Detected stall uffd-wp bit in child"); + + /* Unprotect; this tests swap pte modifications */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Fault in the page from disk */ + *area_dst = 2; + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + close(pagemap_fd); + uffd_test_pass(); +} + uffd_test_case_t uffd_tests[] = { + { + .name = "pagemap", + .uffd_fn = uffd_pagemap_test, + .mem_targets = MEM_ANON, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP, + }, + { + .name = "wp-unpopulated", + .uffd_fn = uffd_wp_unpopulated_test, + .mem_targets = MEM_ANON, + .uffd_feature_required = + UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_WP_UNPOPULATED, + }, }; int main(int argc, char *argv[]) -- cgit v1.2.3 From 62515b5f9fdabf740efd28d5746c219f1b2e75cc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:43:57 -0400 Subject: selftests/mm: move uffd minor test to unit test This moves the minor test to the new unit test. Rewrite the content check with char* opeartions to avoid fiddling with my_bcmp(). Drop global vars test_uffdio_minor and test_collapse, just assume test them always in common code for now. OTOH make this single test into five tests: - minor test on [shmem, hugetlb] with wp=false - minor test on [shmem, hugetlb] with wp=true - minor test + collapse on shmem only One thing to mention that we used to test COLLAPSE+WP but that doesn't sound right at all. It's possible it's silently broken but unnoticed because COLLAPSE is not part of the default test suite. Make the MADV_COLLAPSE test fail-able (by skip it when failing), because it's not guaranteed to success anyway. Drop a bunch of useless code after the move, because the unit test always use aligned num of pages and has nothing to do with n_cpus. 
Link: https://lkml.kernel.org/r/20230412164357.328779-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Zach O'Keefe Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 25 +++-- tools/testing/selftests/mm/uffd-common.h | 4 +- tools/testing/selftests/mm/uffd-stress.c | 131 +-------------------------- tools/testing/selftests/mm/uffd-unit-tests.c | 120 ++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 145 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index bc6c5c38d6dd..12ac84712a38 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -13,8 +13,8 @@ volatile bool test_uffdio_copy_eexist = true; unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; int uffd = -1, uffd_flags, finished, *pipefd, test_type; -bool map_shared, test_collapse, test_dev_userfaultfd; -bool test_uffdio_wp = true, test_uffdio_minor = false; +bool map_shared, test_dev_userfaultfd; +bool test_uffdio_wp = true; unsigned long long *count_verify; uffd_test_ops_t *uffd_test_ops; @@ -128,15 +128,14 @@ static int shmem_allocate_area(void **alloc_area, bool is_src) char *p = NULL, *p_alias = NULL; int mem_fd = uffd_mem_fd_create(bytes * 2, false); - if (test_collapse) { - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); - p_alias = p; - p_alias += bytes; - p_alias += hpage_size; /* Prevent src/dst VMA merge */ - } + /* TODO: clean this up. Use a static addr is ugly */ + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, offset); @@ -144,7 +143,7 @@ static int shmem_allocate_area(void **alloc_area, bool is_src) *alloc_area = NULL; return -errno; } - if (test_collapse && *alloc_area != p) + if (*alloc_area != p) err("mmap of memfd failed at %p", p); area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, @@ -154,7 +153,7 @@ static int shmem_allocate_area(void **alloc_area, bool is_src) *alloc_area = NULL; return -errno; } - if (test_collapse && area_alias != p_alias) + if (area_alias != p_alias) err("mmap of anonymous memory failed at %p", p_alias); if (is_src) diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 9479a0649d7f..4bd5915cf5b4 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -90,8 +90,8 @@ typedef struct uffd_test_ops uffd_test_ops_t; extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; extern int uffd, uffd_flags, finished, *pipefd, test_type; -extern bool map_shared, test_collapse, test_dev_userfaultfd; -extern bool test_uffdio_wp, test_uffdio_minor; +extern bool map_shared, test_dev_userfaultfd; +extern bool test_uffdio_wp; extern unsigned long long *count_verify; extern volatile bool test_uffdio_copy_eexist; diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 61d025d87bf2..f9322bbaf825 100644 
--- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -52,8 +52,6 @@ pthread_attr_t attr; #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) -#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) - const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" @@ -79,8 +77,6 @@ static void usage(void) "Supported mods:\n"); fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); - fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" - "memory\n"); fprintf(stderr, "\nExample test mod usage:\n"); fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); @@ -584,92 +580,6 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } -void check_memory_contents(char *p) -{ - unsigned long i; - uint8_t expected_byte; - void *expected_page; - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (i = 0; i < nr_pages; ++i) { - expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, p + (i * page_size), page_size)) - err("unexpected page contents after minor fault"); - } - - free(expected_page); -} - -static int userfaultfd_minor_test(void) -{ - unsigned long p; - pthread_t uffd_mon; - char c; - struct uffd_args args = { 0 }; - - if (!test_uffdio_minor) - return 0; - - printf("testing minor faults: "); - fflush(stdout); - - uffd_test_ctx_init(uffd_minor_feature()); - - if (uffd_register(uffd, area_dst_alias, nr_pages * page_size, - false, test_uffdio_wp, true)) - err("register failure"); - - /* - * After registering with UFFD, populate the non-UFFD-registered side of - * the shared mapping. This should *not* trigger any UFFD minor faults. - */ - for (p = 0; p < nr_pages; ++p) { - memset(area_dst + (p * page_size), p % ((uint8_t)-1), - page_size); - } - - args.apply_wp = test_uffdio_wp; - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) - err("uffd_poll_thread create"); - - /* - * Read each of the pages back using the UFFD-registered mapping. We - * expect that the first time we touch a page, it will result in a minor - * fault. uffd_poll_thread will resolve the fault by bit-flipping the - * page's contents, and then issuing a CONTINUE ioctl. - */ - check_memory_contents(area_dst_alias); - - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&args, 1); - - if (test_collapse) { - printf("testing collapse of uffd memory into PMD-mapped THPs:"); - if (madvise(area_dst_alias, nr_pages * page_size, - MADV_COLLAPSE)) - err("madvise(MADV_COLLAPSE)"); - - uffd_test_ops->check_pmd_mapping(area_dst, - nr_pages * page_size / - read_pmd_pagesize()); - /* - * This won't cause uffd-fault - it purely just makes sure there - * was no corruption. 
- */ - check_memory_contents(area_dst_alias); - printf(" done.\n"); - } - - return args.missing_faults != 0 || args.minor_faults != nr_pages; -} - static int userfaultfd_stress(void) { void *area; @@ -782,7 +692,7 @@ static int userfaultfd_stress(void) } return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test() || userfaultfd_minor_test(); + || userfaultfd_events_test(); } static void set_test_type(const char *type) @@ -797,13 +707,10 @@ static void set_test_type(const char *type) map_shared = true; test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; - /* Minor faults require shared hugetlb; only enable here. */ - test_uffdio_minor = true; } else if (!strcmp(type, "shmem")) { map_shared = true; test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; - test_uffdio_minor = true; } } @@ -821,8 +728,6 @@ static void parse_test_type_arg(const char *raw_type) test_dev_userfaultfd = true; else if (!strcmp(token, "syscall")) test_dev_userfaultfd = false; - else if (!strcmp(token, "collapse")) - test_collapse = true; else err("unrecognized test mod '%s'", token); } @@ -830,9 +735,6 @@ static void parse_test_type_arg(const char *raw_type) if (!test_type) err("failed to parse test type argument: '%s'", raw_type); - if (test_collapse && test_type != TEST_SHMEM) - err("Unsupported test: %s", raw_type); - if (test_type == TEST_HUGETLB) page_size = default_huge_page_size(); else @@ -854,8 +756,6 @@ static void parse_test_type_arg(const char *raw_type) test_uffdio_wp = test_uffdio_wp && (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); - test_uffdio_minor = test_uffdio_minor && - (features & uffd_minor_feature()); close(uffd); uffd = -1; @@ -872,7 +772,6 @@ static void sigalrm(int sig) int main(int argc, char **argv) { size_t bytes; - size_t hpage_size = read_pmd_pagesize(); if (argc < 4) usage(); @@ -884,36 +783,8 @@ int main(int argc, char **argv) parse_test_type_arg(argv[1]); bytes = atol(argv[2]) * 1024 * 1024; - if (test_collapse && bytes & (hpage_size - 1)) - err("MiB must be multiple of %lu if :collapse mod set", - hpage_size >> 20); - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - if (test_collapse) { - /* nr_cpus must divide (bytes / page_size), otherwise, - * area allocations of (nr_pages * paze_size) won't be a - * multiple of hpage_size, even if bytes is a multiple of - * hpage_size. - * - * This means that nr_cpus must divide (N * (2 << (H-P)) - * where: - * bytes = hpage_size * N - * hpage_size = 2 << H - * page_size = 2 << P - * - * And we want to chose nr_cpus to be the largest value - * satisfying this constraint, not larger than the number - * of online CPUs. Unfortunately, prime factorization of - * N and nr_cpus may be arbitrary, so have to search for it. - * Instead, just use the highest power of 2 dividing both - * nr_cpus and (bytes / page_size). - */ - int x = factor_of_2(nr_cpus); - int y = factor_of_2(bytes / page_size); - - nr_cpus = x < y ? 
x : y; - } nr_pages_per_cpu = bytes / page_size / nr_cpus; if (!nr_pages_per_cpu) { _err("invalid MiB"); diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 4690c95a9420..cba04608bdb0 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -329,6 +329,103 @@ static void uffd_pagemap_test(void) uffd_test_pass(); } +static void check_memory_contents(char *p) +{ + unsigned long i, j; + uint8_t expected_byte; + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + for (j = 0; j < page_size; j++) { + uint8_t v = *(uint8_t *)(p + (i * page_size) + j); + if (v != expected_byte) + err("unexpected page contents"); + } + } +} + +static void uffd_minor_test_common(bool test_collapse, bool test_wp) +{ + unsigned long p; + pthread_t uffd_mon; + char c; + struct uffd_args args = { 0 }; + + /* + * NOTE: MADV_COLLAPSE is not yet compatible with WP, so testing + * both do not make much sense. + */ + assert(!(test_collapse && test_wp)); + + if (uffd_register(uffd, area_dst_alias, nr_pages * page_size, + /* NOTE! MADV_COLLAPSE may not work with uffd-wp */ + false, test_wp, true)) + err("register failure"); + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + + args.apply_wp = test_wp; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + /* + * Read each of the pages back using the UFFD-registered mapping. We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + check_memory_contents(area_dst_alias); + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + err("join() failed"); + + if (test_collapse) { + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) { + /* It's fine to fail for this one... */ + uffd_test_skip("MADV_COLLAPSE failed"); + return; + } + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + read_pmd_pagesize()); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. 
+ */ + check_memory_contents(area_dst_alias); + } + + if (args.missing_faults != 0 || args.minor_faults != nr_pages) + uffd_test_fail("stats check error"); + else + uffd_test_pass(); +} + +void uffd_minor_test(void) +{ + uffd_minor_test_common(false, false); +} + +void uffd_minor_wp_test(void) +{ + uffd_minor_test_common(false, true); +} + +void uffd_minor_collapse_test(void) +{ + uffd_minor_test_common(true, false); +} + uffd_test_case_t uffd_tests[] = { { .name = "pagemap", @@ -343,6 +440,29 @@ uffd_test_case_t uffd_tests[] = { .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_WP_UNPOPULATED, }, + { + .name = "minor", + .uffd_fn = uffd_minor_test, + .mem_targets = MEM_SHMEM | MEM_HUGETLB, + .uffd_feature_required = + UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM, + }, + { + .name = "minor-wp", + .uffd_fn = uffd_minor_wp_test, + .mem_targets = MEM_SHMEM | MEM_HUGETLB, + .uffd_feature_required = + UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM | + UFFD_FEATURE_PAGEFAULT_FLAG_WP, + }, + { + .name = "minor-collapse", + .uffd_fn = uffd_minor_collapse_test, + /* MADV_COLLAPSE only works with shmem */ + .mem_targets = MEM_SHMEM, + /* We can't test MADV_COLLAPSE, so try our luck */ + .uffd_feature_required = UFFD_FEATURE_MINOR_SHMEM, + }, }; int main(int argc, char *argv[]) -- cgit v1.2.3 From 73c1ea939b658962345e81e8bcff1439ede25eff Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:44:00 -0400 Subject: selftests/mm: move uffd sig/events tests into uffd unit tests Move the two tests into the unit test, and convert it into 20 standalone tests: - events test on all 5 mem types, with wp on/off - signal test on all 5 mem types, with wp on/off Testing sigbus on anon... done Testing sigbus on shmem... done Testing sigbus on shmem-private... done Testing sigbus on hugetlb... done Testing sigbus on hugetlb-private... done Testing sigbus-wp on anon... done Testing sigbus-wp on shmem... done Testing sigbus-wp on shmem-private... done Testing sigbus-wp on hugetlb... done Testing sigbus-wp on hugetlb-private... done Testing events on anon... done Testing events on shmem... done Testing events on shmem-private... done Testing events on hugetlb... done Testing events on hugetlb-private... done Testing events-wp on anon... done Testing events-wp on shmem... done Testing events-wp on shmem-private... done Testing events-wp on hugetlb... done Testing events-wp on hugetlb-private... done It'll also remove a lot of global references along the way, e.g. test_uffdio_wp will be replaced with the wp value passed over. 
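For readers less familiar with the sigbus mode converted here: with UFFD_FEATURE_SIGBUS, touching a missing page in the registered range delivers SIGBUS to the faulting thread instead of queueing a fault message, so the test traps it with sigsetjmp()/siglongjmp(), much like the sighndl()/faulting_process() pair in the diff below. A rough standalone sketch of that trick, with an invented helper name:

#include <setjmp.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>

static sigjmp_buf sigbus_jbuf;

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	siglongjmp(sigbus_jbuf, 1);
}

/* Illustrative helper: returns 1 if reading *addr delivered SIGBUS. */
static int touch_raises_sigbus(volatile char *addr)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	act.sa_sigaction = sigbus_handler;
	act.sa_flags = SA_SIGINFO;
	if (sigaction(SIGBUS, &act, NULL))
		abort();

	if (sigsetjmp(sigbus_jbuf, 1))
		return 1;	/* jumped back from the handler */

	(void)*addr;		/* SIGBUS here if the page is missing */
	return 0;
}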
Link: https://lkml.kernel.org/r/20230412164400.328798-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 227 +---------------------- tools/testing/selftests/mm/uffd-unit-tests.c | 264 +++++++++++++++++++++++++++ 2 files changed, 265 insertions(+), 226 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index f9322bbaf825..ce51180238d8 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -273,133 +273,6 @@ static int stress(struct uffd_args *args) return 0; } -sigjmp_buf jbuf, *sigbuf; - -static void sighndl(int sig, siginfo_t *siginfo, void *ptr) -{ - if (sig == SIGBUS) { - if (sigbuf) - siglongjmp(*sigbuf, 1); - abort(); - } -} - -/* - * For non-cooperative userfaultfd test we fork() a process that will - * generate pagefaults, will mremap the area monitored by the - * userfaultfd and at last this process will release the monitored - * area. - * For the anonymous and shared memory the area is divided into two - * parts, the first part is accessed before mremap, and the second - * part is accessed after mremap. Since hugetlbfs does not support - * mremap, the entire monitored area is accessed in a single pass for - * HUGETLB_TEST. - * The release of the pages currently generates event for shmem and - * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked - * for hugetlb. - * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register - * monitored area, generate pagefaults and test that signal is delivered. - * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 - * test robustness use case - we release monitored area, fork a process - * that will generate pagefaults and verify signal is generated. - * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal - * feature. Using monitor thread, verify no userfault events are generated. - */ -static int faulting_process(int signal_test, bool wp) -{ - unsigned long nr; - unsigned long long count; - unsigned long split_nr_pages; - unsigned long lastnr; - struct sigaction act; - volatile unsigned long signalled = 0; - - split_nr_pages = (nr_pages + 1) / 2; - - if (signal_test) { - sigbuf = &jbuf; - memset(&act, 0, sizeof(act)); - act.sa_sigaction = sighndl; - act.sa_flags = SA_SIGINFO; - if (sigaction(SIGBUS, &act, 0)) - err("sigaction"); - lastnr = (unsigned long)-1; - } - - for (nr = 0; nr < split_nr_pages; nr++) { - volatile int steps = 1; - unsigned long offset = nr * page_size; - - if (signal_test) { - if (sigsetjmp(*sigbuf, 1) != 0) { - if (steps == 1 && nr == lastnr) - err("Signal repeated"); - - lastnr = nr; - if (signal_test == 1) { - if (steps == 1) { - /* This is a MISSING request */ - steps++; - if (copy_page(uffd, offset, wp)) - signalled++; - } else { - /* This is a WP request */ - assert(steps == 2); - wp_range(uffd, - (__u64)area_dst + - offset, - page_size, false); - } - } else { - signalled++; - continue; - } - } - } - - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - /* - * Trigger write protection if there is by writing - * the same value back. 
- */ - *area_count(area_dst, nr) = count; - } - - if (signal_test) - return signalled != split_nr_pages; - - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) - err("mremap"); - /* Reset area_src since we just clobbered it */ - area_src = NULL; - - for (; nr < nr_pages; nr++) { - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) { - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - } - /* - * Trigger write protection if there is by writing - * the same value back. - */ - *area_count(area_dst, nr) = count; - } - - uffd_test_ops->release_pages(area_dst); - - for (nr = 0; nr < nr_pages; nr++) - if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) - err("nr %lu is not zero", nr); - - return 0; -} - static void retry_uffdio_zeropage(int ufd, struct uffdio_zeropage *uffdio_zeropage, unsigned long offset) @@ -483,103 +356,6 @@ static int userfaultfd_zeropage_test(void) return 0; } -static int userfaultfd_events_test(void) -{ - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_args args = { 0 }; - - printf("testing events (fork, remap, remove): "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_REMOVE; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - if (uffd_register(uffd, area_dst, nr_pages * page_size, - true, test_uffdio_wp, false)) - err("register failure"); - - args.apply_wp = test_uffdio_wp; - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(0, test_uffdio_wp)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&args, 1); - - return args.missing_faults != nr_pages; -} - -static int userfaultfd_sig_test(void) -{ - unsigned long userfaults; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_args args = { 0 }; - - printf("testing signal delivery: "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - if (uffd_register(uffd, area_dst, nr_pages * page_size, - true, test_uffdio_wp, false)) - err("register failure"); - - if (faulting_process(1, test_uffdio_wp)) - err("faulting process failed"); - - uffd_test_ops->release_pages(area_dst); - - args.apply_wp = test_uffdio_wp; - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &args)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(2, test_uffdio_wp)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, (void **)&userfaults)) - return 1; - - printf("done.\n"); - if (userfaults) - err("Signal test failed, userfaults: %ld", userfaults); - - return userfaults != 0; -} - static int userfaultfd_stress(void) { void *area; @@ -691,8 +467,7 @@ static int userfaultfd_stress(void) uffd_stats_report(args, nr_cpus); } - return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test(); + return userfaultfd_zeropage_test(); } static void 
set_test_type(const char *type) diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index cba04608bdb0..94549696f4b2 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -18,6 +18,9 @@ #define MEM_HUGETLB BIT_ULL(3) #define MEM_HUGETLB_PRIVATE BIT_ULL(4) +#define MEM_ALL (MEM_ANON | MEM_SHMEM | MEM_SHMEM_PRIVATE | \ + MEM_HUGETLB | MEM_HUGETLB_PRIVATE) + struct mem_type { const char *name; unsigned int mem_flag; @@ -426,6 +429,237 @@ void uffd_minor_collapse_test(void) uffd_minor_test_common(true, false); } +static sigjmp_buf jbuf, *sigbuf; + +static void sighndl(int sig, siginfo_t *siginfo, void *ptr) +{ + if (sig == SIGBUS) { + if (sigbuf) + siglongjmp(*sigbuf, 1); + abort(); + } +} + +/* + * For non-cooperative userfaultfd test we fork() a process that will + * generate pagefaults, will mremap the area monitored by the + * userfaultfd and at last this process will release the monitored + * area. + * For the anonymous and shared memory the area is divided into two + * parts, the first part is accessed before mremap, and the second + * part is accessed after mremap. Since hugetlbfs does not support + * mremap, the entire monitored area is accessed in a single pass for + * HUGETLB_TEST. + * The release of the pages currently generates event for shmem and + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked + * for hugetlb. + * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register + * monitored area, generate pagefaults and test that signal is delivered. + * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 + * test robustness use case - we release monitored area, fork a process + * that will generate pagefaults and verify signal is generated. + * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal + * feature. Using monitor thread, verify no userfault events are generated. + */ +static int faulting_process(int signal_test, bool wp) +{ + unsigned long nr, i; + unsigned long long count; + unsigned long split_nr_pages; + unsigned long lastnr; + struct sigaction act; + volatile unsigned long signalled = 0; + + split_nr_pages = (nr_pages + 1) / 2; + + if (signal_test) { + sigbuf = &jbuf; + memset(&act, 0, sizeof(act)); + act.sa_sigaction = sighndl; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); + lastnr = (unsigned long)-1; + } + + for (nr = 0; nr < split_nr_pages; nr++) { + volatile int steps = 1; + unsigned long offset = nr * page_size; + + if (signal_test) { + if (sigsetjmp(*sigbuf, 1) != 0) { + if (steps == 1 && nr == lastnr) + err("Signal repeated"); + + lastnr = nr; + if (signal_test == 1) { + if (steps == 1) { + /* This is a MISSING request */ + steps++; + if (copy_page(uffd, offset, wp)) + signalled++; + } else { + /* This is a WP request */ + assert(steps == 2); + wp_range(uffd, + (__u64)area_dst + + offset, + page_size, false); + } + } else { + signalled++; + continue; + } + } + } + + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + /* + * Trigger write protection if there is by writing + * the same value back. 
+ */ + *area_count(area_dst, nr) = count; + } + + if (signal_test) + return signalled != split_nr_pages; + + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, area_src); + if (area_dst == MAP_FAILED) + err("mremap"); + /* Reset area_src since we just clobbered it */ + area_src = NULL; + + for (; nr < nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + } + /* + * Trigger write protection if there is by writing + * the same value back. + */ + *area_count(area_dst, nr) = count; + } + + uffd_test_ops->release_pages(area_dst); + + for (nr = 0; nr < nr_pages; nr++) + for (i = 0; i < page_size; i++) + if (*(area_dst + nr * page_size + i) != 0) + err("page %lu offset %lu is not zero", nr, i); + + return 0; +} + +static void uffd_sigbus_test_common(bool wp) +{ + unsigned long userfaults; + pthread_t uffd_mon; + pid_t pid; + int err; + char c; + struct uffd_args args = { 0 }; + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, wp, false)) + err("register failure"); + + if (faulting_process(1, wp)) + err("faulting process failed"); + + uffd_test_ops->release_pages(area_dst); + + args.apply_wp = wp; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(2, wp)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, (void **)&userfaults)) + err("pthread_join()"); + + if (userfaults) + uffd_test_fail("Signal test failed, userfaults: %ld", userfaults); + else + uffd_test_pass(); +} + +static void uffd_sigbus_test(void) +{ + uffd_sigbus_test_common(false); +} + +static void uffd_sigbus_wp_test(void) +{ + uffd_sigbus_test_common(true); +} + +static void uffd_events_test_common(bool wp) +{ + pthread_t uffd_mon; + pid_t pid; + int err; + char c; + struct uffd_args args = { 0 }; + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, wp, false)) + err("register failure"); + + args.apply_wp = wp; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(0, wp)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + err("pthread_join()"); + + if (args.missing_faults != nr_pages) + uffd_test_fail("Fault counts wrong"); + else + uffd_test_pass(); +} + +static void uffd_events_test(void) +{ + uffd_events_test_common(false); +} + +static void uffd_events_wp_test(void) +{ + uffd_events_test_common(true); +} + uffd_test_case_t uffd_tests[] = { { .name = "pagemap", @@ -463,6 +697,36 @@ uffd_test_case_t uffd_tests[] = { /* We can't test MADV_COLLAPSE, so try our luck */ .uffd_feature_required = UFFD_FEATURE_MINOR_SHMEM, }, + { + .name = "sigbus", + .uffd_fn = uffd_sigbus_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_SIGBUS | + UFFD_FEATURE_EVENT_FORK, + }, + { + .name = "sigbus-wp", + .uffd_fn = uffd_sigbus_wp_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_SIGBUS | + 
UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP, + }, + { + .name = "events", + .uffd_fn = uffd_events_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_EVENT_FORK | + UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE, + }, + { + .name = "events-wp", + .uffd_fn = uffd_events_wp_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_EVENT_FORK | + UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE | + UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM, + }, }; int main(int argc, char *argv[]) -- cgit v1.2.3 From c3315502c92406ec5bf36ba687f9651b43f838f6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:44:04 -0400 Subject: selftests/mm: move zeropage test into uffd unit tests Simplifies it a bit along the way, e.g., drop the never used offset field (which was always the 1st page so offset=0). Introduce uffd_register_with_ioctls() out of uffd_register() to detect uffdio_register.ioctls got returned. Check that automatically when testing UFFDIO_ZEROPAGE on different types of memory (and kernel). Link: https://lkml.kernel.org/r/20230412164404.328815-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 94 +--------------------------- tools/testing/selftests/mm/uffd-unit-tests.c | 93 +++++++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 14 ++++- tools/testing/selftests/mm/vm_util.h | 2 + 4 files changed, 108 insertions(+), 95 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index ce51180238d8..d78f88850011 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -109,15 +109,6 @@ static inline uint64_t uffd_minor_feature(void) return 0; } -static int my_bcmp(char *str1, char *str2, size_t n) -{ - unsigned long i; - for (i = 0; i < n; i++) - if (str1[i] != str2[i]) - return 1; - return 0; -} - static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -273,89 +264,6 @@ static int stress(struct uffd_args *args) return 0; } -static void retry_uffdio_zeropage(int ufd, - struct uffdio_zeropage *uffdio_zeropage, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, - uffdio_zeropage->range.len, - offset); - if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { - if (uffdio_zeropage->zeropage != -EEXIST) - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } else { - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } -} - -static int __uffdio_zeropage(int ufd, unsigned long offset) -{ - struct uffdio_zeropage uffdio_zeropage; - int ret; - bool has_zeropage = !(test_type == TEST_HUGETLB); - __s64 res; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu", offset); - uffdio_zeropage.range.start = (unsigned long) area_dst + offset; - uffdio_zeropage.range.len = page_size; - uffdio_zeropage.mode = 0; - ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); - res = uffdio_zeropage.zeropage; - if (ret) { - /* real retval in ufdio_zeropage.zeropage */ - if (has_zeropage) - err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); - else if (res != -EINVAL) - err("UFFDIO_ZEROPAGE not -EINVAL"); - } else if (has_zeropage) { - if (res != page_size) { - 
err("UFFDIO_ZEROPAGE unexpected size"); - } else { - retry_uffdio_zeropage(ufd, &uffdio_zeropage, - offset); - return 1; - } - } else - err("UFFDIO_ZEROPAGE succeeded"); - - return 0; -} - -static int uffdio_zeropage(int ufd, unsigned long offset) -{ - return __uffdio_zeropage(ufd, offset); -} - -/* exercise UFFDIO_ZEROPAGE */ -static int userfaultfd_zeropage_test(void) -{ - printf("testing UFFDIO_ZEROPAGE: "); - fflush(stdout); - - uffd_test_ctx_init(0); - - if (uffd_register(uffd, area_dst, nr_pages * page_size, - true, test_uffdio_wp, false)) - err("register failure"); - - if (area_dst_alias) { - /* Needed this to test zeropage-retry on shared memory */ - if (uffd_register(uffd, area_dst_alias, nr_pages * page_size, - true, test_uffdio_wp, false)) - err("register failure"); - } - - if (uffdio_zeropage(uffd, 0)) - if (my_bcmp(area_dst, zeropage, page_size)) - err("zeropage is not zero"); - - printf("done.\n"); - return 0; -} - static int userfaultfd_stress(void) { void *area; @@ -467,7 +375,7 @@ static int userfaultfd_stress(void) uffd_stats_report(args, nr_cpus); } - return userfaultfd_zeropage_test(); + return 0; } static void set_test_type(const char *type) diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 94549696f4b2..160bd8ccda55 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -660,7 +660,100 @@ static void uffd_events_wp_test(void) uffd_events_test_common(true); } +static void retry_uffdio_zeropage(int ufd, + struct uffdio_zeropage *uffdio_zeropage) +{ + uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, + uffdio_zeropage->range.len, + 0); + if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { + if (uffdio_zeropage->zeropage != -EEXIST) + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } else { + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } +} + +static bool do_uffdio_zeropage(int ufd, bool has_zeropage) +{ + struct uffdio_zeropage uffdio_zeropage = { 0 }; + int ret; + __s64 res; + + uffdio_zeropage.range.start = (unsigned long) area_dst; + uffdio_zeropage.range.len = page_size; + uffdio_zeropage.mode = 0; + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + res = uffdio_zeropage.zeropage; + if (ret) { + /* real retval in ufdio_zeropage.zeropage */ + if (has_zeropage) + err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); + else if (res != -EINVAL) + err("UFFDIO_ZEROPAGE not -EINVAL"); + } else if (has_zeropage) { + if (res != page_size) + err("UFFDIO_ZEROPAGE unexpected size"); + else + retry_uffdio_zeropage(ufd, &uffdio_zeropage); + return true; + } else + err("UFFDIO_ZEROPAGE succeeded"); + + return false; +} + +/* + * Registers a range with MISSING mode only for zeropage test. Return true + * if UFFDIO_ZEROPAGE supported, false otherwise. Can't use uffd_register() + * because we want to detect .ioctls along the way. 
+ */ +static bool +uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len) +{ + uint64_t ioctls = 0; + + if (uffd_register_with_ioctls(uffd, addr, len, true, + false, false, &ioctls)) + err("zeropage register fail"); + + return ioctls & (1 << _UFFDIO_ZEROPAGE); +} + +/* exercise UFFDIO_ZEROPAGE */ +static void uffd_zeropage_test(void) +{ + bool has_zeropage; + int i; + + has_zeropage = uffd_register_detect_zeropage(uffd, area_dst, page_size); + if (area_dst_alias) + /* Ignore the retval; we already have it */ + uffd_register_detect_zeropage(uffd, area_dst_alias, page_size); + + if (do_uffdio_zeropage(uffd, has_zeropage)) + for (i = 0; i < page_size; i++) + if (area_dst[i] != 0) + err("data non-zero at offset %d\n", i); + + if (uffd_unregister(uffd, area_dst, page_size)) + err("unregister"); + + if (area_dst_alias && uffd_unregister(uffd, area_dst_alias, page_size)) + err("unregister"); + + uffd_test_pass(); +} + uffd_test_case_t uffd_tests[] = { + { + .name = "zeropage", + .uffd_fn = uffd_zeropage_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = 0, + }, { .name = "pagemap", .uffd_fn = uffd_pagemap_test, diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 1bc0ceb01adb..9b06a5034808 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -198,8 +198,9 @@ unsigned long default_huge_page_size(void) return hps; } -int uffd_register(int uffd, void *addr, uint64_t len, - bool miss, bool wp, bool minor) +/* If `ioctls' non-NULL, the allowed ioctls will be returned into the var */ +int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor, uint64_t *ioctls) { struct uffdio_register uffdio_register = { 0 }; uint64_t mode = 0; @@ -218,10 +219,19 @@ int uffd_register(int uffd, void *addr, uint64_t len, if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) ret = -errno; + else if (ioctls) + *ioctls = uffdio_register.ioctls; return ret; } +int uffd_register(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor) +{ + return uffd_register_with_ioctls(uffd, addr, len, + miss, wp, minor, NULL); +} + int uffd_unregister(int uffd, void *addr, uint64_t len) { struct uffdio_range range = { .start = (uintptr_t)addr, .len = len }; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 634eb2f41145..b950bd16083a 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -52,6 +52,8 @@ int uffd_open_dev(unsigned int flags); int uffd_open_sys(unsigned int flags); int uffd_open(unsigned int flags); int uffd_get_features(uint64_t *features); +int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor, uint64_t *ioctls); /* * On ppc64 this will only work with radix 2M hugepage size -- cgit v1.2.3 From 4df9cefa9419718c32259c105ac8fbbc680410f7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:45:17 -0400 Subject: selftests/mm: workaround no way to detect uffd-minor + wp Userfaultfd minor+wp mode was very recently added. The test will fail on the old kernels at ioctl(UFFDIO_CONTINUE) which is misterious. Unfortunately there's no feature bit to detect for this support. Add a hack to leverage WP_UNPOPULATED to detect whether that feature existed, since WP_UNPOPULATED was merged right after minor+wp. 
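For reference, this kind of probing leans on the UFFDIO_API handshake: when userspace passes features == 0, the kernel reports back the full mask of features it supports. A minimal standalone sketch of such a check (not the selftest helper; it assumes uapi headers new enough to define whatever feature bits a caller passes in):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative probe: returns non-zero if the kernel advertises every
 * feature bit in 'wanted'. */
static int uffd_has_features(unsigned long long wanted)
{
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	int ok = 0;

	if (fd < 0)
		return 0;
	/* With .features == 0 the kernel reports everything it supports */
	if (!ioctl(fd, UFFDIO_API, &api))
		ok = (api.features & wanted) == wanted;
	close(fd);
	return ok;
}

In that spirit, the hack in the diff below treats minor+wp as usable only when the WP_UNPOPULATED bit is also advertised, since the two features landed back to back.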
Link: https://lkml.kernel.org/r/20230412164517.329152-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 160bd8ccda55..a33d6c928eeb 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -780,7 +780,13 @@ uffd_test_case_t uffd_tests[] = { .mem_targets = MEM_SHMEM | MEM_HUGETLB, .uffd_feature_required = UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM | - UFFD_FEATURE_PAGEFAULT_FLAG_WP, + UFFD_FEATURE_PAGEFAULT_FLAG_WP | + /* + * HACK: here we leveraged WP_UNPOPULATED to detect whether + * minor mode supports wr-protect. There's no feature flag + * for it so this is the best we can test against. + */ + UFFD_FEATURE_WP_UNPOPULATED, }, { .name = "minor-collapse", -- cgit v1.2.3 From f9da24263db43da12f1e105cb8ac5e02c66f12d0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:45:20 -0400 Subject: selftests/mm: allow uffd test to skip properly with no privilege Allow skip a unit test properly due to no privilege (e.g. sigbus and events tests). [colin.i.king@gmail.com: fix spelling mistake "priviledge" -> "privilege"] Link: https://lkml.kernel.org/r/20230414081506.1678998-1-colin.i.king@gmail.com Link: https://lkml.kernel.org/r/20230412164520.329163-1-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Colin Ian King Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 27 +++++++++++++++++---------- tools/testing/selftests/mm/uffd-common.h | 4 ++-- tools/testing/selftests/mm/uffd-stress.c | 6 ++++-- tools/testing/selftests/mm/uffd-unit-tests.c | 10 ++++++---- 4 files changed, 29 insertions(+), 18 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 12ac84712a38..3e98e129f8bd 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -232,7 +232,7 @@ void uffd_stats_report(struct uffd_args *args, int n_cpus) printf("\n"); } -void userfaultfd_open(uint64_t *features) +int userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; @@ -241,18 +241,19 @@ void userfaultfd_open(uint64_t *features) else uffd = uffd_open_sys(UFFD_FLAGS); if (uffd < 0) - err("uffd open failed (dev=%d)", test_dev_userfaultfd); + return -1; uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; uffdio_api.features = *features; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - err("UFFDIO_API failed.\nPlease make sure to " - "run with either root or ptrace capability."); + /* Probably lack of CAP_PTRACE? 
*/ + return -1; if (uffdio_api.api != UFFD_API) err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); *features = uffdio_api.features; + return 0; } static inline void munmap_area(void **area) @@ -295,7 +296,7 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_remap); } -int uffd_test_ctx_init(uint64_t features) +int uffd_test_ctx_init(uint64_t features, const char **errmsg) { unsigned long nr, cpu; int ret; @@ -303,13 +304,19 @@ int uffd_test_ctx_init(uint64_t features) uffd_test_ctx_clear(); ret = uffd_test_ops->allocate_area((void **)&area_src, true); - if (ret) - return ret; - ret = uffd_test_ops->allocate_area((void **)&area_dst, false); - if (ret) + ret |= uffd_test_ops->allocate_area((void **)&area_dst, false); + if (ret) { + if (errmsg) + *errmsg = "memory allocation failed"; return ret; + } - userfaultfd_open(&features); + ret = userfaultfd_open(&features); + if (ret) { + if (errmsg) + *errmsg = "possible lack of priviledge"; + return ret; + } count_verify = malloc(nr_pages * sizeof(unsigned long long)); if (!count_verify) diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 4bd5915cf5b4..32e590ce9442 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -101,8 +101,8 @@ extern uffd_test_ops_t hugetlb_uffd_test_ops; extern uffd_test_ops_t *uffd_test_ops; void uffd_stats_report(struct uffd_args *args, int n_cpus); -int uffd_test_ctx_init(uint64_t features); -void userfaultfd_open(uint64_t *features); +int uffd_test_ctx_init(uint64_t features, const char **errmsg); +int userfaultfd_open(uint64_t *features); int uffd_read_msg(int ufd, struct uffd_msg *msg); void wp_range(int ufd, __u64 start, __u64 len, bool wp); void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args); diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index d78f88850011..c0e804f05002 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -271,7 +271,8 @@ static int userfaultfd_stress(void) struct uffd_args args[nr_cpus]; uint64_t mem_size = nr_pages * page_size; - uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); + if (uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED, NULL)) + err("context init failed"); if (posix_memalign(&area, page_size, page_size)) err("out of memory"); @@ -435,7 +436,8 @@ static void parse_test_type_arg(const char *raw_type) * feature. */ - userfaultfd_open(&features); + if (userfaultfd_open(&features)) + err("Userfaultfd open failed"); test_uffdio_wp = test_uffdio_wp && (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index a33d6c928eeb..b0acf558e8cb 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -172,7 +172,8 @@ out: * This function initializes the global variables. TODO: remove global * vars and then remove this. */ -static int uffd_setup_environment(uffd_test_case_t *test, mem_type_t *mem_type) +static int uffd_setup_environment(uffd_test_case_t *test, mem_type_t *mem_type, + const char **errmsg) { map_shared = mem_type->shared; uffd_test_ops = mem_type->mem_ops; @@ -186,7 +187,7 @@ static int uffd_setup_environment(uffd_test_case_t *test, mem_type_t *mem_type) /* TODO: remove this global var.. 
it's so ugly */ nr_cpus = 1; - return uffd_test_ctx_init(test->uffd_feature_required); + return uffd_test_ctx_init(test->uffd_feature_required, errmsg); } static bool uffd_feature_supported(uffd_test_case_t *test) @@ -835,6 +836,7 @@ int main(int argc, char *argv[]) uffd_test_case_t *test; mem_type_t *mem_type; char test_name[128]; + const char *errmsg; int has_uffd; int i, j; @@ -860,8 +862,8 @@ int main(int argc, char *argv[]) uffd_test_skip("feature missing"); continue; } - if (uffd_setup_environment(test, mem_type)) { - uffd_test_skip("environment setup failed"); + if (uffd_setup_environment(test, mem_type, &errmsg)) { + uffd_test_skip(errmsg); continue; } test->uffd_fn(); -- cgit v1.2.3 From 111fd29b2aed34f1841015df2196a9b489da06b0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:45:25 -0400 Subject: selftests/mm: drop sys/dev test in uffd-stress test With the new uffd unit test covering the /dev/userfaultfd path and syscall path of uffd initializations, we can safely drop the devnode test in the old stress test. One thing is to avoid duplication of running the stress test twice which is an overkill to only test the /dev/ interface in run_vmtests.sh. The other benefit is now all uffd tests (that uses userfaultfd_open) can run automatically as long as any type of interface is enabled (either syscall or dev), so it's more likely to succeed rather than fail due to unprivilege. With this patch lands, we can drop all the "mem_type:XXX" handlings too. Link: https://lkml.kernel.org/r/20230412164525.329176-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 15 ++++++--------- tools/testing/selftests/mm/uffd-common.c | 7 ++----- tools/testing/selftests/mm/uffd-common.h | 2 +- tools/testing/selftests/mm/uffd-stress.c | 27 ++------------------------- 4 files changed, 11 insertions(+), 40 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index efe22dc569f0..ecc16ea6fc40 100644 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -197,16 +197,13 @@ CATEGORY="gup_test" run_test ./gup_test -a CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 CATEGORY="userfaultfd" run_test ./uffd-unit-tests -uffd_mods=("" ":dev") uffd_stress_bin=./uffd-stress -for mod in "${uffd_mods[@]}"; do - CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon${mod} 20 16 - # Hugetlb tests require source and destination huge pages. Pass in half - # the size ($half_ufd_size_MB), which is used for *each*. - CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb_shared${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem${mod} 20 16 -done +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 +# Hugetlb tests require source and destination huge pages. Pass in half +# the size ($half_ufd_size_MB), which is used for *each*. 
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb_shared "$half_ufd_size_MB" 32 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 #cleanup echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 3e98e129f8bd..61c6250adf93 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -13,7 +13,7 @@ volatile bool test_uffdio_copy_eexist = true; unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; int uffd = -1, uffd_flags, finished, *pipefd, test_type; -bool map_shared, test_dev_userfaultfd; +bool map_shared; bool test_uffdio_wp = true; unsigned long long *count_verify; uffd_test_ops_t *uffd_test_ops; @@ -236,10 +236,7 @@ int userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; - if (test_dev_userfaultfd) - uffd = uffd_open_dev(UFFD_FLAGS); - else - uffd = uffd_open_sys(UFFD_FLAGS); + uffd = uffd_open(UFFD_FLAGS); if (uffd < 0) return -1; uffd_flags = fcntl(uffd, F_GETFD, NULL); diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 32e590ce9442..6068f2346b86 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -90,7 +90,7 @@ typedef struct uffd_test_ops uffd_test_ops_t; extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; extern int uffd, uffd_flags, finished, *pipefd, test_type; -extern bool map_shared, test_dev_userfaultfd; +extern bool map_shared; extern bool test_uffdio_wp; extern unsigned long long *count_verify; extern volatile bool test_uffdio_copy_eexist; diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index c0e804f05002..4e071a7d0ff5 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -55,8 +55,6 @@ pthread_attr_t attr; const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" - "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" - "./userfaultfd anon:dev 100 99999\n\n" "# Run share memory test on 1GiB region with 99 bounces:\n" "./userfaultfd shmem 1000 99\n\n" "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" @@ -69,18 +67,9 @@ const char *examples = static void usage(void) { - fprintf(stderr, "\nUsage: ./userfaultfd " - "[hugetlbfs_file]\n\n"); + fprintf(stderr, "\nUsage: ./userfaultfd \n\n"); fprintf(stderr, "Supported : anon, hugetlb, " "hugetlb_shared, shmem\n\n"); - fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. 
" - "Supported mods:\n"); - fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); - fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); - fprintf(stderr, "\nExample test mod usage:\n"); - fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); - fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); - fprintf(stderr, "Examples:\n\n"); fprintf(stderr, "%s", examples); exit(1); @@ -400,21 +389,9 @@ static void set_test_type(const char *type) static void parse_test_type_arg(const char *raw_type) { - char *buf = strdup(raw_type); uint64_t features = UFFD_API_FEATURES; - while (buf) { - const char *token = strsep(&buf, ":"); - - if (!test_type) - set_test_type(token); - else if (!strcmp(token, "dev")) - test_dev_userfaultfd = true; - else if (!strcmp(token, "syscall")) - test_dev_userfaultfd = false; - else - err("unrecognized test mod '%s'", token); - } + set_test_type(raw_type); if (!test_type) err("failed to parse test type argument: '%s'", raw_type); -- cgit v1.2.3 From 5aec236fdd69d39f9fa5e579d10f7a45ed11b005 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:45:46 -0400 Subject: selftests/mm: add shmem-private test to uffd-stress The userfaultfd stress test never tested private shmem, which I think was overlooked long due. Add it so it matches with uffd unit test and it'll cover all memory supported with the three memory types. Meanwhile, rename the memory types a bit. Considering shared mem is the major use case for both shmem / hugetlbfs, changing from: anon, hugetlb, hugetlb_shared, shmem To (with shmem-private added): anon, hugetlb, hugetlb-private, shmem, shmem-private Add the shmem-private to run_vmtests.sh too. Link: https://lkml.kernel.org/r/20230412164546.329355-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 3 ++- tools/testing/selftests/mm/uffd-stress.c | 11 +++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index ecc16ea6fc40..438eb49567b6 100644 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -202,8 +202,9 @@ CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 # Hugetlb tests require source and destination huge pages. Pass in half # the size ($half_ufd_size_MB), which is used for *each*. 
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 -CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb_shared "$half_ufd_size_MB" 32 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem-private 20 16 #cleanup echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 4e071a7d0ff5..f1ad9eef1c3a 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -59,8 +59,8 @@ const char *examples = "./userfaultfd shmem 1000 99\n\n" "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" "./userfaultfd hugetlb 256 50\n\n" - "# Run the same hugetlb test but using shared file:\n" - "./userfaultfd hugetlb_shared 256 50\n\n" + "# Run the same hugetlb test but using private file:\n" + "./userfaultfd hugetlb-private 256 50\n\n" "# 10MiB-~6GiB 999 bounces anonymous test, " "continue forever unless an error triggers\n" "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; @@ -69,7 +69,7 @@ static void usage(void) { fprintf(stderr, "\nUsage: ./userfaultfd \n\n"); fprintf(stderr, "Supported : anon, hugetlb, " - "hugetlb_shared, shmem\n\n"); + "hugetlb-private, shmem, shmem-private\n\n"); fprintf(stderr, "Examples:\n\n"); fprintf(stderr, "%s", examples); exit(1); @@ -376,14 +376,17 @@ static void set_test_type(const char *type) } else if (!strcmp(type, "hugetlb")) { test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; - } else if (!strcmp(type, "hugetlb_shared")) { map_shared = true; + } else if (!strcmp(type, "hugetlb-private")) { test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; } else if (!strcmp(type, "shmem")) { map_shared = true; test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; + } else if (!strcmp(type, "shmem-private")) { + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; } } -- cgit v1.2.3 From 43759d44dc346273efd79e99ba5d195ed6c2bfa2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:45:48 -0400 Subject: selftests/mm: add uffdio register ioctls test This new test tests against the returned ioctls from UFFDIO_REGISTER, where put into uffdio_register.ioctls. 
This also tests the expected failure cases of UFFDIO_REGISTER, aka: - Register with empty mode should fail with -EINVAL - Register minor without page cache (anon) should fail with -EINVAL Link: https://lkml.kernel.org/r/20230412164548.329376-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 112 +++++++++++++++++++++++---- 1 file changed, 97 insertions(+), 15 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index b0acf558e8cb..d871bf732e62 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -62,8 +62,14 @@ mem_type_t mem_types[] = { }, }; +/* Arguments to be passed over to each uffd unit test */ +struct uffd_test_args { + mem_type_t *mem_type; +}; +typedef struct uffd_test_args uffd_test_args_t; + /* Returns: UFFD_TEST_* */ -typedef void (*uffd_test_fn)(void); +typedef void (*uffd_test_fn)(uffd_test_args_t *); typedef struct { const char *name; @@ -172,8 +178,9 @@ out: * This function initializes the global variables. TODO: remove global * vars and then remove this. */ -static int uffd_setup_environment(uffd_test_case_t *test, mem_type_t *mem_type, - const char **errmsg) +static int +uffd_setup_environment(uffd_test_args_t *args, uffd_test_case_t *test, + mem_type_t *mem_type, const char **errmsg) { map_shared = mem_type->shared; uffd_test_ops = mem_type->mem_ops; @@ -187,6 +194,9 @@ static int uffd_setup_environment(uffd_test_case_t *test, mem_type_t *mem_type, /* TODO: remove this global var.. 
it's so ugly */ nr_cpus = 1; + /* Initialize test arguments */ + args->mem_type = mem_type; + return uffd_test_ctx_init(test->uffd_feature_required, errmsg); } @@ -239,7 +249,7 @@ static int pagemap_test_fork(bool present) return result; } -static void uffd_wp_unpopulated_test(void) +static void uffd_wp_unpopulated_test(uffd_test_args_t *args) { uint64_t value; int pagemap_fd; @@ -285,7 +295,7 @@ static void uffd_wp_unpopulated_test(void) uffd_test_pass(); } -static void uffd_pagemap_test(void) +static void uffd_pagemap_test(uffd_test_args_t *args) { int pagemap_fd; uint64_t value; @@ -415,17 +425,17 @@ static void uffd_minor_test_common(bool test_collapse, bool test_wp) uffd_test_pass(); } -void uffd_minor_test(void) +void uffd_minor_test(uffd_test_args_t *args) { uffd_minor_test_common(false, false); } -void uffd_minor_wp_test(void) +void uffd_minor_wp_test(uffd_test_args_t *args) { uffd_minor_test_common(false, true); } -void uffd_minor_collapse_test(void) +void uffd_minor_collapse_test(uffd_test_args_t *args) { uffd_minor_test_common(true, false); } @@ -603,12 +613,12 @@ static void uffd_sigbus_test_common(bool wp) uffd_test_pass(); } -static void uffd_sigbus_test(void) +static void uffd_sigbus_test(uffd_test_args_t *args) { uffd_sigbus_test_common(false); } -static void uffd_sigbus_wp_test(void) +static void uffd_sigbus_wp_test(uffd_test_args_t *args) { uffd_sigbus_test_common(true); } @@ -651,12 +661,12 @@ static void uffd_events_test_common(bool wp) uffd_test_pass(); } -static void uffd_events_test(void) +static void uffd_events_test(uffd_test_args_t *args) { uffd_events_test_common(false); } -static void uffd_events_wp_test(void) +static void uffd_events_wp_test(uffd_test_args_t *args) { uffd_events_test_common(true); } @@ -724,7 +734,7 @@ uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len) } /* exercise UFFDIO_ZEROPAGE */ -static void uffd_zeropage_test(void) +static void uffd_zeropage_test(uffd_test_args_t *args) { bool has_zeropage; int i; @@ -748,7 +758,77 @@ static void uffd_zeropage_test(void) uffd_test_pass(); } +/* + * Test the returned uffdio_register.ioctls with different register modes. + * Note that _UFFDIO_ZEROPAGE is tested separately in the zeropage test. + */ +static void +do_register_ioctls_test(uffd_test_args_t *args, bool miss, bool wp, bool minor) +{ + uint64_t ioctls = 0, expected = BIT_ULL(_UFFDIO_WAKE); + mem_type_t *mem_type = args->mem_type; + int ret; + + ret = uffd_register_with_ioctls(uffd, area_dst, page_size, + miss, wp, minor, &ioctls); + + /* + * Handle special cases of UFFDIO_REGISTER here where it should + * just fail with -EINVAL first.. 
+ * + * Case 1: register MINOR on anon + * Case 2: register with no mode selected + */ + if ((minor && (mem_type->mem_flag == MEM_ANON)) || + (!miss && !wp && !minor)) { + if (ret != -EINVAL) + err("register (miss=%d, wp=%d, minor=%d) failed " + "with wrong errno=%d", miss, wp, minor, ret); + return; + } + + /* UFFDIO_REGISTER should succeed, then check ioctls returned */ + if (miss) + expected |= BIT_ULL(_UFFDIO_COPY); + if (wp) + expected |= BIT_ULL(_UFFDIO_WRITEPROTECT); + if (minor) + expected |= BIT_ULL(_UFFDIO_CONTINUE); + + if ((ioctls & expected) != expected) + err("unexpected uffdio_register.ioctls " + "(miss=%d, wp=%d, minor=%d): expected=0x%"PRIx64", " + "returned=0x%"PRIx64, miss, wp, minor, expected, ioctls); + + if (uffd_unregister(uffd, area_dst, page_size)) + err("unregister"); +} + +static void uffd_register_ioctls_test(uffd_test_args_t *args) +{ + int miss, wp, minor; + + for (miss = 0; miss <= 1; miss++) + for (wp = 0; wp <= 1; wp++) + for (minor = 0; minor <= 1; minor++) + do_register_ioctls_test(args, miss, wp, minor); + + uffd_test_pass(); +} + uffd_test_case_t uffd_tests[] = { + { + /* Test returned uffdio_register.ioctls. */ + .name = "register-ioctls", + .uffd_fn = uffd_register_ioctls_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_MISSING_HUGETLBFS | + UFFD_FEATURE_MISSING_SHMEM | + UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | + UFFD_FEATURE_MINOR_HUGETLBFS | + UFFD_FEATURE_MINOR_SHMEM, + }, { .name = "zeropage", .uffd_fn = uffd_zeropage_test, @@ -835,6 +915,7 @@ int main(int argc, char *argv[]) int n_mems = sizeof(mem_types) / sizeof(mem_type_t); uffd_test_case_t *test; mem_type_t *mem_type; + uffd_test_args_t args; char test_name[128]; const char *errmsg; int has_uffd; @@ -862,11 +943,12 @@ int main(int argc, char *argv[]) uffd_test_skip("feature missing"); continue; } - if (uffd_setup_environment(test, mem_type, &errmsg)) { + if (uffd_setup_environment(&args, test, mem_type, + &errmsg)) { uffd_test_skip(errmsg); continue; } - test->uffd_fn(); + test->uffd_fn(&args); } } -- cgit v1.2.3 From cd834afa8ee3751644534be20d63cff445641f0f Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Thu, 23 Mar 2023 16:22:39 +0530 Subject: selftests/mm: add support for arm64 platform on va switch Patch series "selftests/mm: Implement support for arm64 on va". The va_128TBswitch selftest is designed and implemented for PowerPC and x86 architectures which support a 128TB switch, up to 256TB of virtual address space and hugepage sizes of 16MB and 2MB respectively. Arm64 platforms on the other hand support a 256Tb switch, up to 4PB of virtual address space and a default hugepage size of 512MB when 64k pagesize is enabled. These architectural differences require introducing support for arm64 platforms, after which a more generic naming convention is suggested. The in code comments are amended to provide a more platform independent explanation of the working of the code and nr_hugepages are configured as required. Finally, the file running the testcase is modified in order to prevent skipping of hugetlb testcases of va_high_addr_switch. This patch (of 5): Arm64 platforms have the ability to support 64kb pagesize, 512MB default hugepage size and up to 4PB of virtual address space. The address switch occurs at 256TB as opposed to 128TB. Hence, the necessary support has been added. 
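As a rough illustration (not part of this patch), the switch behaviour can be probed from userspace with a plain mmap() hint; the constants below mirror the ADDR_SWITCH_HINT values used by the test:

#include <stdio.h>
#include <sys/mman.h>

#ifdef __aarch64__
#define SWITCH_HINT	(1UL << 48)	/* 256TB, matching the test's arm64 setup */
#else
#define SWITCH_HINT	(1UL << 47)	/* 128TB on x86_64/powerpc64 */
#endif

int main(void)
{
	void *p = mmap((void *)SWITCH_HINT, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	printf("hint 0x%lx, got %p (%s the switch boundary)\n", SWITCH_HINT, p,
	       (unsigned long)p >= SWITCH_HINT ? "above" : "below");
	munmap(p, 4096);
	return 0;
}

Without MAP_FIXED the kernel is free to ignore the hint and fall back to the lower address space, which is exactly what the low_addr_required testcases rely on.
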
Link: https://lkml.kernel.org/r/20230323105243.2807166-1-chaitanyas.prakash@arm.com Link: https://lkml.kernel.org/r/20230323105243.2807166-2-chaitanyas.prakash@arm.com Signed-off-by: Chaitanya S Prakash Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: Shuah Khan Cc: Mark Rutland Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_128TBswitch.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/va_128TBswitch.c b/tools/testing/selftests/mm/va_128TBswitch.c index 1d2068989883..6679213effed 100644 --- a/tools/testing/selftests/mm/va_128TBswitch.c +++ b/tools/testing/selftests/mm/va_128TBswitch.c @@ -17,6 +17,13 @@ * This will work with 16M and 2M hugepage size */ #define HUGETLB_SIZE (16 << 20) +#elif __aarch64__ +/* + * The default hugepage size for 64k base pagesize + * is 512MB. + */ +#define PAGE_SIZE (64 << 10) +#define HUGETLB_SIZE (512 << 20) #else #define PAGE_SIZE (4 << 10) #define HUGETLB_SIZE (2 << 20) @@ -26,9 +33,22 @@ * >= 128TB is the hint addr value we used to select * large address space. */ -#define ADDR_SWITCH_HINT (1UL << 47) + +#define ADDR_MARK_128TB (1UL << 47) +#define ADDR_MARK_256TB (1UL << 48) + +#define HIGH_ADDR_128TB ((void *) (1UL << 48)) +#define HIGH_ADDR_256TB ((void *) (1UL << 49)) + #define LOW_ADDR ((void *) (1UL << 30)) -#define HIGH_ADDR ((void *) (1UL << 48)) + +#ifdef __aarch64__ +#define ADDR_SWITCH_HINT ADDR_MARK_256TB +#define HIGH_ADDR HIGH_ADDR_256TB +#else +#define ADDR_SWITCH_HINT ADDR_MARK_128TB +#define HIGH_ADDR HIGH_ADDR_128TB +#endif struct testcase { void *addr; @@ -270,6 +290,8 @@ static int supported_arch(void) return 1; #elif defined(__x86_64__) return 1; +#elif defined(__aarch64__) + return 1; #else return 0; #endif -- cgit v1.2.3 From bbe168729d4ef8305f4e35c3bbe4711389ab8354 Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Thu, 23 Mar 2023 16:22:40 +0530 Subject: selftests/mm: rename va_128TBswitch to va_high_addr_switch As the initial selftest only took into consideration PowperPC and x86 architectures, on adding support for arm64, a platform independent naming convention is chosen. Link: https://lkml.kernel.org/r/20230323105243.2807166-3-chaitanyas.prakash@arm.com Signed-off-by: Chaitanya S Prakash Cc: Aneesh Kumar K.V Cc: Kirill A. 
Shutemov Cc: Shuah Khan Cc: Mark Rutland Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 4 +- tools/testing/selftests/mm/run_vmtests.sh | 2 +- tools/testing/selftests/mm/va_128TBswitch.c | 311 ---------------------- tools/testing/selftests/mm/va_128TBswitch.sh | 54 ---- tools/testing/selftests/mm/va_high_addr_switch.c | 311 ++++++++++++++++++++++ tools/testing/selftests/mm/va_high_addr_switch.sh | 54 ++++ 6 files changed, 368 insertions(+), 368 deletions(-) delete mode 100644 tools/testing/selftests/mm/va_128TBswitch.c delete mode 100644 tools/testing/selftests/mm/va_128TBswitch.sh create mode 100644 tools/testing/selftests/mm/va_high_addr_switch.c create mode 100644 tools/testing/selftests/mm/va_high_addr_switch.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 63c03a6414fc..de32f7dafa5d 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -94,7 +94,7 @@ endif endif ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) -TEST_GEN_PROGS += va_128TBswitch +TEST_GEN_PROGS += va_high_addr_switch TEST_GEN_PROGS += virtual_address_range TEST_GEN_PROGS += write_to_hugetlbfs endif @@ -103,7 +103,7 @@ TEST_PROGS := run_vmtests.sh TEST_FILES := test_vmalloc.sh TEST_FILES += test_hmm.sh -TEST_FILES += va_128TBswitch.sh +TEST_FILES += va_high_addr_switch.sh include ../lib.mk diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 438eb49567b6..83b59ae2d6f0 100644 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -237,7 +237,7 @@ if [ $VADDR64 -ne 0 ]; then echo $prev_policy > /proc/sys/vm/overcommit_memory # virtual address 128TB switch test - CATEGORY="hugevm" run_test ./va_128TBswitch.sh + CATEGORY="hugevm" run_test ./va_high_addr_switch.sh fi # VADDR64 # vmalloc stability smoke test diff --git a/tools/testing/selftests/mm/va_128TBswitch.c b/tools/testing/selftests/mm/va_128TBswitch.c deleted file mode 100644 index 6679213effed..000000000000 --- a/tools/testing/selftests/mm/va_128TBswitch.c +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * - * Authors: Kirill A. Shutemov - * Authors: Aneesh Kumar K.V - */ - -#include -#include -#include - -#include "../kselftest.h" - -#ifdef __powerpc64__ -#define PAGE_SIZE (64 << 10) -/* - * This will work with 16M and 2M hugepage size - */ -#define HUGETLB_SIZE (16 << 20) -#elif __aarch64__ -/* - * The default hugepage size for 64k base pagesize - * is 512MB. - */ -#define PAGE_SIZE (64 << 10) -#define HUGETLB_SIZE (512 << 20) -#else -#define PAGE_SIZE (4 << 10) -#define HUGETLB_SIZE (2 << 20) -#endif - -/* - * >= 128TB is the hint addr value we used to select - * large address space. 
- */ - -#define ADDR_MARK_128TB (1UL << 47) -#define ADDR_MARK_256TB (1UL << 48) - -#define HIGH_ADDR_128TB ((void *) (1UL << 48)) -#define HIGH_ADDR_256TB ((void *) (1UL << 49)) - -#define LOW_ADDR ((void *) (1UL << 30)) - -#ifdef __aarch64__ -#define ADDR_SWITCH_HINT ADDR_MARK_256TB -#define HIGH_ADDR HIGH_ADDR_256TB -#else -#define ADDR_SWITCH_HINT ADDR_MARK_128TB -#define HIGH_ADDR HIGH_ADDR_128TB -#endif - -struct testcase { - void *addr; - unsigned long size; - unsigned long flags; - const char *msg; - unsigned int low_addr_required:1; - unsigned int keep_mapped:1; -}; - -static struct testcase testcases[] = { - { - /* - * If stack is moved, we could possibly allocate - * this at the requested address. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - /* - * We should never allocate at the requested address or above it - * The len cross the 128TB boundary. Without MAP_FIXED - * we will always search in the lower address space. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", - .low_addr_required = 1, - }, - { - /* - * Exact mapping at 128TB, the area is free we should get that - * even without MAP_FIXED. - */ - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, - { - .addr = NULL, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED)", - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1) again", - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | 
MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, -}; - -static struct testcase hugetlb_testcases[] = { - { - .addr = NULL, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB) again", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", - }, -}; - -static int run_test(struct testcase *test, int count) -{ - void *p; - int i, ret = KSFT_PASS; - - for (i = 0; i < count; i++) { - struct testcase *t = test + i; - - p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); - - printf("%s: %p - ", t->msg, p); - - if (p == MAP_FAILED) { - printf("FAILED\n"); - ret = KSFT_FAIL; - continue; - } - - if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { - printf("FAILED\n"); - ret = KSFT_FAIL; - } else { - /* - * Do a dereference of the address returned so that we catch - * bugs in page fault handling - */ - memset(p, 0, t->size); - printf("OK\n"); - } - if (!t->keep_mapped) - munmap(p, t->size); - } - - return ret; -} - -static int supported_arch(void) -{ -#if defined(__powerpc64__) - return 1; -#elif defined(__x86_64__) - return 1; -#elif defined(__aarch64__) - return 1; -#else - return 0; -#endif -} - -int main(int argc, char **argv) -{ - int ret; - - if (!supported_arch()) - return KSFT_SKIP; - - ret = run_test(testcases, ARRAY_SIZE(testcases)); - if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); - return ret; -} diff --git a/tools/testing/selftests/mm/va_128TBswitch.sh b/tools/testing/selftests/mm/va_128TBswitch.sh deleted file mode 100644 index 41580751dc51..000000000000 --- a/tools/testing/selftests/mm/va_128TBswitch.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2022 Adam Sindelar (Meta) -# -# This is a test for mmap behavior with 5-level paging. 
This script wraps the -# real test to check that the kernel is configured to support at least 5 -# pagetable levels. - -# 1 means the test failed -exitcode=1 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -fail() -{ - echo "$1" - exit $exitcode -} - -check_supported_x86_64() -{ - local config="/proc/config.gz" - [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" - [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" - - # gzip -dcfq automatically handles both compressed and plaintext input. - # See man 1 gzip under '-f'. - local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) - - if [[ "${pg_table_levels}" -lt 5 ]]; then - echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" - exit $ksft_skip - fi -} - -check_test_requirements() -{ - # The test supports x86_64 and powerpc64. We currently have no useful - # eligibility check for powerpc64, and the test itself will reject other - # architectures. - case `uname -m` in - "x86_64") - check_supported_x86_64 - ;; - *) - return 0 - ;; - esac -} - -check_test_requirements -./va_128TBswitch diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c new file mode 100644 index 000000000000..6679213effed --- /dev/null +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Authors: Kirill A. Shutemov + * Authors: Aneesh Kumar K.V + */ + +#include +#include +#include + +#include "../kselftest.h" + +#ifdef __powerpc64__ +#define PAGE_SIZE (64 << 10) +/* + * This will work with 16M and 2M hugepage size + */ +#define HUGETLB_SIZE (16 << 20) +#elif __aarch64__ +/* + * The default hugepage size for 64k base pagesize + * is 512MB. + */ +#define PAGE_SIZE (64 << 10) +#define HUGETLB_SIZE (512 << 20) +#else +#define PAGE_SIZE (4 << 10) +#define HUGETLB_SIZE (2 << 20) +#endif + +/* + * >= 128TB is the hint addr value we used to select + * large address space. + */ + +#define ADDR_MARK_128TB (1UL << 47) +#define ADDR_MARK_256TB (1UL << 48) + +#define HIGH_ADDR_128TB ((void *) (1UL << 48)) +#define HIGH_ADDR_256TB ((void *) (1UL << 49)) + +#define LOW_ADDR ((void *) (1UL << 30)) + +#ifdef __aarch64__ +#define ADDR_SWITCH_HINT ADDR_MARK_256TB +#define HIGH_ADDR HIGH_ADDR_256TB +#else +#define ADDR_SWITCH_HINT ADDR_MARK_128TB +#define HIGH_ADDR HIGH_ADDR_128TB +#endif + +struct testcase { + void *addr; + unsigned long size; + unsigned long flags; + const char *msg; + unsigned int low_addr_required:1; + unsigned int keep_mapped:1; +}; + +static struct testcase testcases[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + /* + * We should never allocate at the requested address or above it + * The len cross the 128TB boundary. Without MAP_FIXED + * we will always search in the lower address space. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at 128TB, the area is free we should get that + * even without MAP_FIXED. 
+ */ + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, +}; + +static struct testcase hugetlb_testcases[] = { + { + .addr = NULL, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = 
"mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", + }, +}; + +static int run_test(struct testcase *test, int count) +{ + void *p; + int i, ret = KSFT_PASS; + + for (i = 0; i < count; i++) { + struct testcase *t = test + i; + + p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); + + printf("%s: %p - ", t->msg, p); + + if (p == MAP_FAILED) { + printf("FAILED\n"); + ret = KSFT_FAIL; + continue; + } + + if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + printf("FAILED\n"); + ret = KSFT_FAIL; + } else { + /* + * Do a dereference of the address returned so that we catch + * bugs in page fault handling + */ + memset(p, 0, t->size); + printf("OK\n"); + } + if (!t->keep_mapped) + munmap(p, t->size); + } + + return ret; +} + +static int supported_arch(void) +{ +#if defined(__powerpc64__) + return 1; +#elif defined(__x86_64__) + return 1; +#elif defined(__aarch64__) + return 1; +#else + return 0; +#endif +} + +int main(int argc, char **argv) +{ + int ret; + + if (!supported_arch()) + return KSFT_SKIP; + + ret = run_test(testcases, ARRAY_SIZE(testcases)); + if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) + ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + return ret; +} diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh new file mode 100644 index 000000000000..3056788a27ac --- /dev/null +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022 Adam Sindelar (Meta) +# +# This is a test for mmap behavior with 5-level paging. This script wraps the +# real test to check that the kernel is configured to support at least 5 +# pagetable levels. + +# 1 means the test failed +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +fail() +{ + echo "$1" + exit $exitcode +} + +check_supported_x86_64() +{ + local config="/proc/config.gz" + [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" + [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" + + # gzip -dcfq automatically handles both compressed and plaintext input. + # See man 1 gzip under '-f'. + local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + + if [[ "${pg_table_levels}" -lt 5 ]]; then + echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" + exit $ksft_skip + fi +} + +check_test_requirements() +{ + # The test supports x86_64 and powerpc64. We currently have no useful + # eligibility check for powerpc64, and the test itself will reject other + # architectures. 
+ case `uname -m` in + "x86_64") + check_supported_x86_64 + ;; + *) + return 0 + ;; + esac +} + +check_test_requirements +./va_high_addr_switch -- cgit v1.2.3 From c2af2a41905efcf662b53f280f8538e5c6bd05f4 Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Thu, 23 Mar 2023 16:22:41 +0530 Subject: selftests/mm: add platform independent in code comments The in code comments for the selftest were made on the basis of 128TB switch, an architecture feature specific to PowerPc and x86 platforms. Keeping in mind the support added for arm64 platforms which implements a 256TB switch, a more generic explanation has been provided. Link: https://lkml.kernel.org/r/20230323105243.2807166-4-chaitanyas.prakash@arm.com Signed-off-by: Chaitanya S Prakash Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: Shuah Khan Cc: Mark Rutland Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 6679213effed..7cfaf4a74c57 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -30,8 +30,8 @@ #endif /* - * >= 128TB is the hint addr value we used to select - * large address space. + * The hint addr value is used to allocate addresses + * beyond the high address switch boundary. */ #define ADDR_MARK_128TB (1UL << 47) @@ -73,9 +73,10 @@ static struct testcase testcases[] = { }, { /* - * We should never allocate at the requested address or above it - * The len cross the 128TB boundary. Without MAP_FIXED - * we will always search in the lower address space. + * Unless MAP_FIXED is specified, allocation based on hint + * addr is never at requested address or above it, which is + * beyond high address switch boundary in this case. Instead, + * a suitable allocation is found in lower address space. */ .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), .size = 2 * PAGE_SIZE, @@ -85,8 +86,8 @@ static struct testcase testcases[] = { }, { /* - * Exact mapping at 128TB, the area is free we should get that - * even without MAP_FIXED. + * Exact mapping at high address switch boundary, should + * be obtained even without MAP_FIXED as area is free. */ .addr = ((void *)(ADDR_SWITCH_HINT)), .size = PAGE_SIZE, -- cgit v1.2.3 From 2f489e2e69463729eefc77da22c163ae63b48b74 Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Thu, 23 Mar 2023 16:22:42 +0530 Subject: selftests/mm: configure nr_hugepages for arm64 Arm64 has a default hugepage size of 512MB when CONFIG_ARM64_64K_PAGES=y is enabled. While testing on arm64 platforms having up to 4PB of virtual address space, a minimum of 6 hugepages were required for all test cases to pass. Support for this requirement has been added. Link: https://lkml.kernel.org/r/20230323105243.2807166-5-chaitanyas.prakash@arm.com Signed-off-by: Chaitanya S Prakash Cc: Aneesh Kumar K.V Cc: Kirill A. 
Shutemov Cc: Shuah Khan Cc: Mark Rutland Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 83b59ae2d6f0..83c438a18aa7 100644 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -236,8 +236,16 @@ if [ $VADDR64 -ne 0 ]; then CATEGORY="hugevm" run_test ./virtual_address_range echo $prev_policy > /proc/sys/vm/overcommit_memory - # virtual address 128TB switch test + # va high address boundary switch test + ARCH_ARM64="arm64" + prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages) + if [ "$ARCH" == "$ARCH_ARM64" ]; then + echo 6 > /proc/sys/vm/nr_hugepages + fi CATEGORY="hugevm" run_test ./va_high_addr_switch.sh + if [ "$ARCH" == "$ARCH_ARM64" ]; then + echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages + fi fi # VADDR64 # vmalloc stability smoke test -- cgit v1.2.3 From c025da0f14e80028de93f72976a0d81dafc17d03 Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Thu, 23 Mar 2023 16:22:43 +0530 Subject: selftests/mm: run hugetlb testcases of va switch The va_high_addr_switch selftest is used to test mmap across 128TB boundary. It divides the selftest cases into two main categories on the basis of size. One set is used to create mappings that are multiples of PAGE_SIZE while the other creates mappings that are multiples of HUGETLB_SIZE. In order to run the hugetlb testcases the binary must be appended with "--run-hugetlb" but the file that used to run the test only invokes the binary, thereby completely skipping the hugetlb testcases. Hence, the required statement has been added. Link: https://lkml.kernel.org/r/20230323105243.2807166-6-chaitanyas.prakash@arm.com Signed-off-by: Chaitanya S Prakash Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: Shuah Khan Cc: Mark Rutland Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index 3056788a27ac..45cae7cab27e 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -52,3 +52,7 @@ check_test_requirements() check_test_requirements ./va_high_addr_switch + +# In order to run hugetlb testcases, "--run-hugetlb" must be appended +# to the binary. +./va_high_addr_switch --run-hugetlb -- cgit v1.2.3 From 3cc0c3738cde66a1eadf977fa7b2fdecbbcf5d4f Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Fri, 14 Apr 2023 02:28:01 +0000 Subject: selftests/memfd: fix test_sysctl sysctl memfd_noexec is pid-namespaced, non-reservable, and inherent to the child process. Move the inherence test from init ns to child ns, so init ns can keep the default value. 
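For context, a minimal sketch (illustrative, not part of this patch) of how the effect of the current pid namespace's vm.memfd_noexec setting shows up in the seals of a newly created memfd:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_EXEC
#define F_SEAL_EXEC	0x0020	/* may be missing from older libc headers */
#endif

int main(void)
{
	int fd = memfd_create("probe", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	int seals;

	if (fd < 0)
		return 1;
	seals = fcntl(fd, F_GET_SEALS);
	if (seals < 0)
		return 1;
	/* With memfd_noexec >= 1 in this pid ns, F_SEAL_EXEC gets applied */
	printf("seals = 0x%x, F_SEAL_EXEC %s\n", seals,
	       (seals & F_SEAL_EXEC) ? "set" : "not set");
	close(fd);
	return 0;
}

Because the sysctl is pid-namespaced, flipping it in a child namespace does not change what this probe reports in the init namespace.
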
Link: https://lkml.kernel.org/r/20230414022801.2545257-1-jeffxu@google.com Signed-off-by: Jeff Xu Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202303312259.441e35db-yujie.liu@intel.com Tested-by: Yujie Liu Cc: Daniel Verkamp Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index ae71f15f790d..dba0e8ba002f 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -43,6 +43,9 @@ */ static size_t mfd_def_size = MFD_DEF_SIZE; static const char *memfd_str = MEMFD_STR; +static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *)); +static int newpid_thread_fn2(void *arg); +static void join_newpid_thread(pid_t pid); static ssize_t fd2name(int fd, char *buf, size_t bufsize) { @@ -1111,6 +1114,7 @@ static void test_noexec_seal(void) static void test_sysctl_child(void) { int fd; + int pid; printf("%s sysctl 0\n", memfd_str); sysctl_assert_write("0"); @@ -1129,6 +1133,10 @@ static void test_sysctl_child(void) mfd_def_size, MFD_CLOEXEC | MFD_ALLOW_SEALING); + printf("%s child ns\n", memfd_str); + pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2); + join_newpid_thread(pid); + mfd_assert_mode(fd, 0666); mfd_assert_has_seals(fd, F_SEAL_EXEC); mfd_fail_chmod(fd, 0777); @@ -1206,12 +1214,6 @@ static void test_sysctl(void) int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn); join_newpid_thread(pid); - - printf("%s child ns\n", memfd_str); - sysctl_assert_write("1"); - - pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2); - join_newpid_thread(pid); } /* -- cgit v1.2.3 From 21337f2af16cb36782ff5031f2553f9cf2f7c787 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 17 Apr 2023 15:53:14 -0400 Subject: selftests/mm: add a few options for uffd-unit-test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Namely: "-f": add a wildcard filter for tests to run "-l": list tests rather than running any "-h": help msg Link: https://lkml.kernel.org/r/20230417195317.898696-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mika Penttilä Cc: Mike Kravetz Cc: Nadav Amit Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 52 ++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index d871bf732e62..452ca05a829d 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -909,28 +909,65 @@ uffd_test_case_t uffd_tests[] = { }, }; +static void usage(const char *prog) +{ + printf("usage: %s [-f TESTNAME]\n", prog); + puts(""); + puts(" -f: test name to filter (e.g., event)"); + puts(" -h: show the help msg"); + puts(" -l: list tests only"); + puts(""); + exit(KSFT_FAIL); +} + int main(int argc, char *argv[]) { int n_tests = sizeof(uffd_tests) / sizeof(uffd_test_case_t); int n_mems = sizeof(mem_types) / sizeof(mem_type_t); + const char *test_filter = NULL; + bool list_only = false; uffd_test_case_t *test; mem_type_t *mem_type; uffd_test_args_t args; char 
test_name[128]; const char *errmsg; - int has_uffd; + int has_uffd, opt; int i, j; - has_uffd = test_uffd_api(false); - has_uffd |= test_uffd_api(true); + while ((opt = getopt(argc, argv, "f:hl")) != -1) { + switch (opt) { + case 'f': + test_filter = optarg; + break; + case 'l': + list_only = true; + break; + case 'h': + default: + /* Unknown */ + usage(argv[0]); + break; + } + } + + if (!test_filter && !list_only) { + has_uffd = test_uffd_api(false); + has_uffd |= test_uffd_api(true); - if (!has_uffd) { - printf("Userfaultfd not supported or unprivileged, skip all tests\n"); - exit(KSFT_SKIP); + if (!has_uffd) { + printf("Userfaultfd not supported or unprivileged, skip all tests\n"); + exit(KSFT_SKIP); + } } for (i = 0; i < n_tests; i++) { test = &uffd_tests[i]; + if (test_filter && !strstr(test->name, test_filter)) + continue; + if (list_only) { + printf("%s\n", test->name); + continue; + } for (j = 0; j < n_mems; j++) { mem_type = &mem_types[j]; if (!(test->mem_targets & mem_type->mem_flag)) @@ -952,7 +989,8 @@ int main(int argc, char *argv[]) } } - uffd_test_report(); + if (!list_only) + uffd_test_report(); return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS; } -- cgit v1.2.3 From cff294582798dce6ca2b2d08051f563c0f7a09c8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 17 Apr 2023 15:53:15 -0400 Subject: selftests/mm: extend and rename uffd pagemap test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend it to all types of mem, meanwhile add one parallel test when EVENT_FORK is enabled, where uffd-wp bits should be persisted rather than dropped. Since at it, rename the test to "wp-fork" to better show what it means. Making the new test called "wp-fork-with-event". Before: Testing pagemap on anon... done After: Testing wp-fork on anon... done Testing wp-fork on shmem... done Testing wp-fork on shmem-private... done Testing wp-fork on hugetlb... done Testing wp-fork on hugetlb-private... done Testing wp-fork-with-event on anon... done Testing wp-fork-with-event on shmem... done Testing wp-fork-with-event on shmem-private... done Testing wp-fork-with-event on hugetlb... done Testing wp-fork-with-event on hugetlb-private... 
done Link: https://lkml.kernel.org/r/20230417195317.898696-5-peterx@redhat.com Signed-off-by: Peter Xu Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Mika Penttilä Cc: Mike Kravetz Cc: Nadav Amit Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 130 ++++++++++++++++++++++----- 1 file changed, 106 insertions(+), 24 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 452ca05a829d..739fc4d30342 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -227,25 +227,65 @@ static int pagemap_open(void) err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ } while (0) -static int pagemap_test_fork(bool present) +typedef struct { + int parent_uffd, child_uffd; +} fork_event_args; + +static void *fork_event_consumer(void *data) { - pid_t child = fork(); + fork_event_args *args = data; + struct uffd_msg msg = { 0 }; + + /* Read until a full msg received */ + while (uffd_read_msg(args->parent_uffd, &msg)); + + if (msg.event != UFFD_EVENT_FORK) + err("wrong message: %u\n", msg.event); + + /* Just to be properly freed later */ + args->child_uffd = msg.arg.fork.ufd; + return NULL; +} + +static int pagemap_test_fork(int uffd, bool with_event) +{ + fork_event_args args = { .parent_uffd = uffd, .child_uffd = -1 }; + pthread_t thread; + pid_t child; uint64_t value; int fd, result; + /* Prepare a thread to resolve EVENT_FORK */ + if (with_event) { + if (pthread_create(&thread, NULL, fork_event_consumer, &args)) + err("pthread_create()"); + } + + child = fork(); if (!child) { /* Open the pagemap fd of the child itself */ fd = pagemap_open(); value = pagemap_get_entry(fd, area_dst); /* - * After fork() uffd-wp bit should be gone as long as we're - * without UFFD_FEATURE_EVENT_FORK + * After fork(), we should handle uffd-wp bit differently: + * + * (1) when with EVENT_FORK, it should persist + * (2) when without EVENT_FORK, it should be dropped */ - pagemap_check_wp(value, false); + pagemap_check_wp(value, with_event); /* Succeed */ exit(0); } waitpid(child, &result, 0); + + if (with_event) { + if (pthread_join(thread, NULL)) + err("pthread_join()"); + if (args.child_uffd < 0) + err("Didn't receive child uffd"); + close(args.child_uffd); + } + return result; } @@ -295,7 +335,8 @@ static void uffd_wp_unpopulated_test(uffd_test_args_t *args) uffd_test_pass(); } -static void uffd_pagemap_test(uffd_test_args_t *args) +static void uffd_wp_fork_test_common(uffd_test_args_t *args, + bool with_event) { int pagemap_fd; uint64_t value; @@ -311,23 +352,42 @@ static void uffd_pagemap_test(uffd_test_args_t *args) wp_range(uffd, (uint64_t)area_dst, page_size, true); value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(true)) - err("Detected stall uffd-wp bit in child"); - - /* Exclusive required or PAGEOUT won't work */ - if (!(value & PM_MMAP_EXCLUSIVE)) - err("multiple mapping detected: 0x%"PRIx64, value); + if (pagemap_test_fork(uffd, with_event)) { + uffd_test_fail("Detected %s uffd-wp bit in child in present pte", + with_event ? "missing" : "stall"); + goto out; + } - if (madvise(area_dst, page_size, MADV_PAGEOUT)) - err("madvise(MADV_PAGEOUT) failed"); + /* + * This is an attempt for zapping the pgtable so as to test the + * markers. 
+ * + * For private mappings, PAGEOUT will only work on exclusive ptes + * (PM_MMAP_EXCLUSIVE) which we should satisfy. + * + * For shared, PAGEOUT may not work. Use DONTNEED instead which + * plays a similar role of zapping (rather than freeing the page) + * to expose pte markers. + */ + if (args->mem_type->shared) { + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("MADV_DONTNEED"); + } else { + /* + * NOTE: ignore retval because private-hugetlb doesn't yet + * support swapping, so it could fail. + */ + madvise(area_dst, page_size, MADV_PAGEOUT); + } /* Uffd-wp should persist even swapped out */ value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(false)) - err("Detected stall uffd-wp bit in child"); + if (pagemap_test_fork(uffd, with_event)) { + uffd_test_fail("Detected %s uffd-wp bit in child in zapped pte", + with_event ? "missing" : "stall"); + goto out; + } /* Unprotect; this tests swap pte modifications */ wp_range(uffd, (uint64_t)area_dst, page_size, false); @@ -338,9 +398,21 @@ static void uffd_pagemap_test(uffd_test_args_t *args) *area_dst = 2; value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, false); - - close(pagemap_fd); uffd_test_pass(); +out: + if (uffd_unregister(uffd, area_dst, nr_pages * page_size)) + err("unregister failed"); + close(pagemap_fd); +} + +static void uffd_wp_fork_test(uffd_test_args_t *args) +{ + uffd_wp_fork_test_common(args, false); +} + +static void uffd_wp_fork_with_event_test(uffd_test_args_t *args) +{ + uffd_wp_fork_test_common(args, true); } static void check_memory_contents(char *p) @@ -836,10 +908,20 @@ uffd_test_case_t uffd_tests[] = { .uffd_feature_required = 0, }, { - .name = "pagemap", - .uffd_fn = uffd_pagemap_test, - .mem_targets = MEM_ANON, - .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP, + .name = "wp-fork", + .uffd_fn = uffd_wp_fork_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM, + }, + { + .name = "wp-fork-with-event", + .uffd_fn = uffd_wp_fork_with_event_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | + /* when set, child process should inherit uffd-wp bits */ + UFFD_FEATURE_EVENT_FORK, }, { .name = "wp-unpopulated", -- cgit v1.2.3 From 71fc41eb98357d147fd26d3eb130ca9daf7a9bf3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 17 Apr 2023 15:53:16 -0400 Subject: selftests/mm: rename COW_EXTRA_LIBS to IOURING_EXTRA_LIBS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The macro and facility can be reused in other tests too. Make it general. Link: https://lkml.kernel.org/r/20230417195317.898696-6-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mika Penttilä Cc: Mike Kravetz Cc: Nadav Amit Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 8 ++++---- tools/testing/selftests/mm/check_config.sh | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index de32f7dafa5d..63ed778ca8dc 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -161,8 +161,8 @@ warn_32bit_failure: endif endif -# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. 
-$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS) +# IOURING_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. +$(OUTPUT)/cow: LDLIBS += $(IOURING_EXTRA_LIBS) $(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap @@ -175,11 +175,11 @@ local_config.mk local_config.h: check_config.sh EXTRA_CLEAN += local_config.mk local_config.h -ifeq ($(COW_EXTRA_LIBS),) +ifeq ($(IOURING_EXTRA_LIBS),) all: warn_missing_liburing warn_missing_liburing: @echo ; \ - echo "Warning: missing liburing support. Some COW tests will be skipped." ; \ + echo "Warning: missing liburing support. Some tests will be skipped." ; \ echo endif diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh index bcba3af0acea..3954f4746161 100644 --- a/tools/testing/selftests/mm/check_config.sh +++ b/tools/testing/selftests/mm/check_config.sh @@ -21,11 +21,11 @@ $CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 if [ -f $tmpfile_o ]; then echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE - echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE + echo "IOURING_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE else echo "// No liburing support found" > $OUTPUT_H_FILE echo "# No liburing support found, so:" > $OUTPUT_MKFILE - echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE + echo "IOURING_EXTRA_LIBS = " >> $OUTPUT_MKFILE fi rm ${tmpname}.* -- cgit v1.2.3 From 760aee0b71e34b4b9535e237347ff31b93961cbf Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 17 Apr 2023 15:53:17 -0400 Subject: selftests/mm: add tests for RO pinning vs fork() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test suite (with 10 more sub-tests) to cover RO pinning against fork() over uffd-wp. It covers both: (1) Early CoW test in fork() when page pinned, (2) page unshare due to RO longterm pin. They are: Testing wp-fork-pin on anon... done Testing wp-fork-pin on shmem... done Testing wp-fork-pin on shmem-private... done Testing wp-fork-pin on hugetlb... done Testing wp-fork-pin on hugetlb-private... done Testing wp-fork-pin-with-event on anon... done Testing wp-fork-pin-with-event on shmem... done Testing wp-fork-pin-with-event on shmem-private... done Testing wp-fork-pin-with-event on hugetlb... done Testing wp-fork-pin-with-event on hugetlb-private... done CONFIG_GUP_TEST needed or they'll be skipped. Testing wp-fork-pin on anon... skipped [reason: Possibly CONFIG_GUP_TEST missing or unprivileged] Note that the major test goal is on private memory, but no hurt to also run all of them over shared because shared memory should work the same. 
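For reference, a small sketch (helper name is illustrative, not taken from the selftests) of how the uffd-wp bit, bit 57 of a pagemap entry, can be read for a single virtual address:

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Returns 1 if the pte backing @addr is uffd-wp write-protected, 0 if
 * not, -1 on error.  Bit 57 is documented in
 * Documentation/admin-guide/mm/pagemap.rst.
 */
static int vaddr_uffd_wp(void *addr)
{
	uint64_t entry = 0;
	long psize = sysconf(_SC_PAGESIZE);
	off_t off = ((uintptr_t)addr / psize) * sizeof(entry);
	int fd = open("/proc/self/pagemap", O_RDONLY);
	ssize_t r;

	if (fd < 0)
		return -1;
	r = pread(fd, &entry, sizeof(entry), off);
	close(fd);
	return r == sizeof(entry) ? !!(entry & (1ULL << 57)) : -1;
}

int main(void)
{
	static char page[1 << 16];

	page[0] = 1;	/* make sure the page is populated */
	printf("uffd-wp bit: %d\n", vaddr_uffd_wp(page));
	return 0;
}
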
Link: https://lkml.kernel.org/r/20230417195317.898696-7-peterx@redhat.com Signed-off-by: Peter Xu Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Mika Penttilä Cc: Mike Kravetz Cc: Nadav Amit Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 144 ++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 739fc4d30342..269c86768a02 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -7,6 +7,8 @@ #include "uffd-common.h" +#include "../../../../mm/gup_test.h" + #ifdef __NR_userfaultfd /* The unit test doesn't need a large or random size, make it 32MB for now */ @@ -247,7 +249,53 @@ static void *fork_event_consumer(void *data) return NULL; } -static int pagemap_test_fork(int uffd, bool with_event) +typedef struct { + int gup_fd; + bool pinned; +} pin_args; + +/* + * Returns 0 if succeed, <0 for errors. pin_pages() needs to be paired + * with unpin_pages(). Currently it needs to be RO longterm pin to satisfy + * all needs of the test cases (e.g., trigger unshare, trigger fork() early + * CoW, etc.). + */ +static int pin_pages(pin_args *args, void *buffer, size_t size) +{ + struct pin_longterm_test test = { + .addr = (uintptr_t)buffer, + .size = size, + /* Read-only pins */ + .flags = 0, + }; + + if (args->pinned) + err("already pinned"); + + args->gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (args->gup_fd < 0) + return -errno; + + if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_START, &test)) { + /* Even if gup_test existed, can be an old gup_test / kernel */ + close(args->gup_fd); + return -errno; + } + args->pinned = true; + return 0; +} + +static void unpin_pages(pin_args *args) +{ + if (!args->pinned) + err("unpin without pin first"); + if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_STOP)) + err("PIN_LONGTERM_TEST_STOP"); + close(args->gup_fd); + args->pinned = false; +} + +static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) { fork_event_args args = { .parent_uffd = uffd, .child_uffd = -1 }; pthread_t thread; @@ -264,7 +312,17 @@ static int pagemap_test_fork(int uffd, bool with_event) child = fork(); if (!child) { /* Open the pagemap fd of the child itself */ + pin_args args = {}; + fd = pagemap_open(); + + if (test_pin && pin_pages(&args, area_dst, page_size)) + /* + * Normally when reach here we have pinned in + * previous tests, so shouldn't fail anymore + */ + err("pin page failed in child"); + value = pagemap_get_entry(fd, area_dst); /* * After fork(), we should handle uffd-wp bit differently: @@ -273,6 +331,8 @@ static int pagemap_test_fork(int uffd, bool with_event) * (2) when without EVENT_FORK, it should be dropped */ pagemap_check_wp(value, with_event); + if (test_pin) + unpin_pages(&args); /* Succeed */ exit(0); } @@ -352,7 +412,7 @@ static void uffd_wp_fork_test_common(uffd_test_args_t *args, wp_range(uffd, (uint64_t)area_dst, page_size, true); value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, true); - if (pagemap_test_fork(uffd, with_event)) { + if (pagemap_test_fork(uffd, with_event, false)) { uffd_test_fail("Detected %s uffd-wp bit in child in present pte", with_event ? 
"missing" : "stall"); goto out; @@ -383,7 +443,7 @@ static void uffd_wp_fork_test_common(uffd_test_args_t *args, /* Uffd-wp should persist even swapped out */ value = pagemap_get_entry(pagemap_fd, area_dst); pagemap_check_wp(value, true); - if (pagemap_test_fork(uffd, with_event)) { + if (pagemap_test_fork(uffd, with_event, false)) { uffd_test_fail("Detected %s uffd-wp bit in child in zapped pte", with_event ? "missing" : "stall"); goto out; @@ -415,6 +475,68 @@ static void uffd_wp_fork_with_event_test(uffd_test_args_t *args) uffd_wp_fork_test_common(args, true); } +static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args, + bool with_event) +{ + int pagemap_fd; + pin_args pin_args = {}; + + if (uffd_register(uffd, area_dst, page_size, false, true, false)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, page_size, true); + + /* + * 1. First pin, then fork(). This tests fork() special path when + * doing early CoW if the page is private. + */ + if (pin_pages(&pin_args, area_dst, page_size)) { + uffd_test_skip("Possibly CONFIG_GUP_TEST missing " + "or unprivileged"); + close(pagemap_fd); + uffd_unregister(uffd, area_dst, page_size); + return; + } + + if (pagemap_test_fork(uffd, with_event, false)) { + uffd_test_fail("Detected %s uffd-wp bit in early CoW of fork()", + with_event ? "missing" : "stall"); + unpin_pages(&pin_args); + goto out; + } + + unpin_pages(&pin_args); + + /* + * 2. First fork(), then pin (in the child, where test_pin==true). + * This tests COR, aka, page unsharing on private memories. + */ + if (pagemap_test_fork(uffd, with_event, true)) { + uffd_test_fail("Detected %s uffd-wp bit when RO pin", + with_event ? "missing" : "stall"); + goto out; + } + uffd_test_pass(); +out: + if (uffd_unregister(uffd, area_dst, page_size)) + err("register failed"); + close(pagemap_fd); +} + +static void uffd_wp_fork_pin_test(uffd_test_args_t *args) +{ + uffd_wp_fork_pin_test_common(args, false); +} + +static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t *args) +{ + uffd_wp_fork_pin_test_common(args, true); +} + static void check_memory_contents(char *p) { unsigned long i, j; @@ -923,6 +1045,22 @@ uffd_test_case_t uffd_tests[] = { /* when set, child process should inherit uffd-wp bits */ UFFD_FEATURE_EVENT_FORK, }, + { + .name = "wp-fork-pin", + .uffd_fn = uffd_wp_fork_pin_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM, + }, + { + .name = "wp-fork-pin-with-event", + .uffd_fn = uffd_wp_fork_pin_with_event_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | + /* when set, child process should inherit uffd-wp bits */ + UFFD_FEATURE_EVENT_FORK, + }, { .name = "wp-unpopulated", .uffd_fn = uffd_wp_unpopulated_test, -- cgit v1.2.3 From 07115fcc15b4aa5c268fb80b82ad15868a82a285 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:42 -0700 Subject: selftests/mm: add new selftests for KSM This adds three new tests to the selftests for KSM. These tests use the new prctl API's to enable and disable KSM. 1) add new prctl flags to prctl header file in tools dir This adds the new prctl flags to the include file prct.h in the tools directory. This makes sure they are available for testing. 2) add KSM prctl merge test to ksm_tests This adds the -t option to the ksm_tests program. 
The -t flag allows to specify if it should use madvise or prctl ksm merging. 3) add two functions for debugging merge outcome for ksm_tests This adds two functions to report the metrics in /proc/self/ksm_stat and /sys/kernel/debug/mm/ksm. The debug output is enabled with the -d option. 4) add KSM prctl test to ksm_functional_tests This adds a test to the ksm_functional_test that verifies that the prctl system call to enable / disable KSM works. 5) add KSM fork test to ksm_functional_test Add fork test to verify that the MMF_VM_MERGE_ANY flag is inherited by the child process. Link: https://lkml.kernel.org/r/20230418051342.1919757-4-shr@devkernel.io Signed-off-by: Stefan Roesch Acked-by: David Hildenbrand Cc: Bagas Sanjaya Cc: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton --- tools/include/uapi/linux/prctl.h | 2 + tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/ksm_functional_tests.c | 91 +++++++++++- tools/testing/selftests/mm/ksm_tests.c | 172 +++++++++++++++++----- 4 files changed, 228 insertions(+), 39 deletions(-) (limited to 'tools/testing') diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 1312a137f7fb..759b3f53e53f 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -290,4 +290,6 @@ struct prctl_mm_map { #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 +#define PR_SET_MEMORY_MERGE 67 +#define PR_GET_MEMORY_MERGE 68 #endif /* _LINUX_PRCTL_H */ diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 63ed778ca8dc..f764f2b3e34b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -29,7 +29,7 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p # LDLIBS. MAKEFLAGS += --no-builtin-rules -CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) +CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/tools/include/uapi $(EXTRA_CFLAGS) $(KHDR_INCLUDES) LDLIBS = -lrt -lpthread TEST_GEN_PROGS = cow diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index d3f26050dfd7..7bc9fc17c9f0 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -15,8 +15,10 @@ #include #include #include +#include #include #include +#include #include #include "../kselftest.h" @@ -237,9 +239,93 @@ unmap: } #endif +/* Verify that KSM can be enabled / queried with prctl. 
*/ +static void test_prctl(void) +{ + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno == EINVAL) { + ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + return; + } else if (ret) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + return; + } + + ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0); + if (ret < 0) { + ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n"); + return; + } else if (ret != 1) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 not effective\n"); + return; + } + + ret = prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); + if (ret) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n"); + return; + } + + ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0); + if (ret < 0) { + ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n"); + return; + } else if (ret != 0) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 not effective\n"); + return; + } + + ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n"); +} + +/* Verify that prctl ksm flag is inherited. */ +static void test_prctl_fork(void) +{ + int ret, status; + pid_t child_pid; + + ksft_print_msg("[RUN] %s\n", __func__); + + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno == EINVAL) { + ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + return; + } else if (ret) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + return; + } + + child_pid = fork(); + if (!child_pid) { + exit(prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0)); + } else if (child_pid < 0) { + ksft_test_result_fail("fork() failed\n"); + return; + } + + if (waitpid(child_pid, &status, 0) < 0) { + ksft_test_result_fail("waitpid() failed\n"); + return; + } else if (WEXITSTATUS(status) != 1) { + ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + return; + } + + if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n"); + return; + } + + ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); +} + int main(int argc, char **argv) { - unsigned int tests = 2; + unsigned int tests = 4; int err; #ifdef __NR_userfaultfd @@ -267,6 +353,9 @@ int main(int argc, char **argv) test_unmerge_uffd_wp(); #endif + test_prctl(); + test_prctl_fork(); + err = ksft_get_fail_cnt(); if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c index 74281593a124..435acebdc325 100644 --- a/tools/testing/selftests/mm/ksm_tests.c +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include #include #include #include @@ -21,6 +23,7 @@ #define KSM_PROT_STR_DEFAULT "rw" #define KSM_USE_ZERO_PAGES_DEFAULT false #define KSM_MERGE_ACROSS_NODES_DEFAULT true +#define KSM_MERGE_TYPE_DEFAULT 0 #define MB (1ul << 20) struct ksm_sysfs { @@ -33,9 +36,16 @@ struct ksm_sysfs { unsigned long use_zero_pages; }; +enum ksm_merge_type { + KSM_MERGE_MADVISE, + KSM_MERGE_PRCTL, + KSM_MERGE_LAST = KSM_MERGE_PRCTL +}; + enum ksm_test_name { CHECK_KSM_MERGE, CHECK_KSM_UNMERGE, + CHECK_KSM_GET_MERGE_TYPE, CHECK_KSM_ZERO_PAGE_MERGE, CHECK_KSM_NUMA_MERGE, KSM_MERGE_TIME, @@ -44,6 +54,8 @@ enum ksm_test_name { KSM_COW_TIME }; +int debug; + static int ksm_write_sysfs(const char *file_path, unsigned long val) { FILE *f = fopen(file_path, "w"); @@ -82,6 +94,53 @@ static int ksm_read_sysfs(const char *file_path, unsigned long *val) return 0; } +static void 
ksm_print_sysfs(void) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + unsigned long full_scans, pages_unshared, pages_volatile; + unsigned long stable_node_chains, stable_node_dups; + long general_profit; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing) || + ksm_read_sysfs(KSM_FP("full_scans"), &full_scans) || + ksm_read_sysfs(KSM_FP("pages_unshared"), &pages_unshared) || + ksm_read_sysfs(KSM_FP("pages_volatile"), &pages_volatile) || + ksm_read_sysfs(KSM_FP("stable_node_chains"), &stable_node_chains) || + ksm_read_sysfs(KSM_FP("stable_node_dups"), &stable_node_dups) || + ksm_read_sysfs(KSM_FP("general_profit"), (unsigned long *)&general_profit)) + return; + + printf("pages_shared : %lu\n", pages_shared); + printf("pages_sharing : %lu\n", pages_sharing); + printf("max_page_sharing : %lu\n", max_page_sharing); + printf("full_scans : %lu\n", full_scans); + printf("pages_unshared : %lu\n", pages_unshared); + printf("pages_volatile : %lu\n", pages_volatile); + printf("stable_node_chains: %lu\n", stable_node_chains); + printf("stable_node_dups : %lu\n", stable_node_dups); + printf("general_profit : %ld\n", general_profit); +} + +static void ksm_print_procfs(void) +{ + const char *file_name = "/proc/self/ksm_stat"; + char buffer[512]; + FILE *f = fopen(file_name, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_name); + perror("fopen"); + return; + } + + while (fgets(buffer, sizeof(buffer), f)) + printf("%s", buffer); + + fclose(f); +} + static int str_to_prot(char *prot_str) { int prot = 0; @@ -128,7 +187,12 @@ static void print_help(void) " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); printf(" -m: change merge_across_nodes tunable\n" " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); + printf(" -d: turn debugging output on\n"); printf(" -s: the size of duplicated memory area (in MiB)\n"); + printf(" -t: KSM merge type\n" + " Default: 0\n" + " 0: madvise merging\n" + " 1: prctl merging\n"); exit(0); } @@ -176,12 +240,21 @@ static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) return 0; } -static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) +static int ksm_merge_pages(int merge_type, void *addr, size_t size, + struct timespec start_time, int timeout) { - if (madvise(addr, size, MADV_MERGEABLE)) { - perror("madvise"); - return 1; + if (merge_type == KSM_MERGE_MADVISE) { + if (madvise(addr, size, MADV_MERGEABLE)) { + perror("madvise"); + return 1; + } + } else if (merge_type == KSM_MERGE_PRCTL) { + if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0)) { + perror("prctl"); + return 1; + } } + if (ksm_write_sysfs(KSM_FP("run"), 1)) return 1; @@ -211,6 +284,11 @@ static bool assert_ksm_pages_count(long dupl_page_count) ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) return false; + if (debug) { + ksm_print_sysfs(); + ksm_print_procfs(); + } + /* * Since there must be at least 2 pages for merging and 1 page can be * shared with the limited number of pages (max_page_sharing), sometimes @@ -266,7 +344,8 @@ static int ksm_restore(struct ksm_sysfs *ksm_sysfs) return 0; } -static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) +static int check_ksm_merge(int merge_type, int mapping, int prot, + long page_count, int timeout, size_t page_size) { void *map_ptr; struct timespec start_time; @@ -281,13 +360,15 @@ static int 
check_ksm_merge(int mapping, int prot, long page_count, int timeout, if (!map_ptr) return KSFT_FAIL; - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout)) goto err_out; /* verify that the right number of pages are merged */ if (assert_ksm_pages_count(page_count)) { printf("OK\n"); munmap(map_ptr, page_size * page_count); + if (merge_type == KSM_MERGE_PRCTL) + prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); return KSFT_PASS; } @@ -297,7 +378,7 @@ err_out: return KSFT_FAIL; } -static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) +static int check_ksm_unmerge(int merge_type, int mapping, int prot, int timeout, size_t page_size) { void *map_ptr; struct timespec start_time; @@ -313,7 +394,7 @@ static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_siz if (!map_ptr) return KSFT_FAIL; - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout)) goto err_out; /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ @@ -337,8 +418,8 @@ err_out: return KSFT_FAIL; } -static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, - bool use_zero_pages, size_t page_size) +static int check_ksm_zero_page_merge(int merge_type, int mapping, int prot, long page_count, + int timeout, bool use_zero_pages, size_t page_size) { void *map_ptr; struct timespec start_time; @@ -356,7 +437,7 @@ static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int if (!map_ptr) return KSFT_FAIL; - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout)) goto err_out; /* @@ -402,8 +483,8 @@ static int get_first_mem_node(void) return get_next_mem_node(numa_max_node()); } -static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, - size_t page_size) +static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeout, + bool merge_across_nodes, size_t page_size) { void *numa1_map_ptr, *numa2_map_ptr; struct timespec start_time; @@ -439,8 +520,8 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a memset(numa2_map_ptr, '*', page_size); /* try to merge the pages */ - if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || - ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) + if (ksm_merge_pages(merge_type, numa1_map_ptr, page_size, start_time, timeout) || + ksm_merge_pages(merge_type, numa2_map_ptr, page_size, start_time, timeout)) goto err_out; /* @@ -466,7 +547,8 @@ err_out: return KSFT_FAIL; } -static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) +static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot, + int timeout, size_t map_size) { void *map_ptr, *map_ptr_orig; struct timespec start_time, end_time; @@ -508,7 +590,7 @@ static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t m perror("clock_gettime"); goto err_out; } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { perror("clock_gettime"); @@ -533,7 +615,7 @@ err_out: return KSFT_FAIL; } -static int 
ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) +static int ksm_merge_time(int merge_type, int mapping, int prot, int timeout, size_t map_size) { void *map_ptr; struct timespec start_time, end_time; @@ -549,7 +631,7 @@ static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) perror("clock_gettime"); goto err_out; } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { perror("clock_gettime"); @@ -574,7 +656,7 @@ err_out: return KSFT_FAIL; } -static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) +static int ksm_unmerge_time(int merge_type, int mapping, int prot, int timeout, size_t map_size) { void *map_ptr; struct timespec start_time, end_time; @@ -589,7 +671,7 @@ static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) perror("clock_gettime"); goto err_out; } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { @@ -621,7 +703,7 @@ err_out: return KSFT_FAIL; } -static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) +static int ksm_cow_time(int merge_type, int mapping, int prot, int timeout, size_t page_size) { void *map_ptr; struct timespec start_time, end_time; @@ -660,7 +742,7 @@ static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) memset(map_ptr + page_size * i, '+', i / 2 + 1); memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); } - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { @@ -697,6 +779,7 @@ int main(int argc, char *argv[]) int ret, opt; int prot = 0; int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; + int merge_type = KSM_MERGE_TYPE_DEFAULT; long page_count = KSM_PAGE_COUNT_DEFAULT; size_t page_size = sysconf(_SC_PAGESIZE); struct ksm_sysfs ksm_sysfs_old; @@ -705,7 +788,7 @@ int main(int argc, char *argv[]) bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; long size_MB = 0; - while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) { + while ((opt = getopt(argc, argv, "dha:p:l:z:m:s:t:MUZNPCHD")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -739,12 +822,26 @@ int main(int argc, char *argv[]) else merge_across_nodes = 1; break; + case 'd': + debug = 1; + break; case 's': size_MB = atoi(optarg); if (size_MB <= 0) { printf("Size must be greater than 0\n"); return KSFT_FAIL; } + case 't': + { + int tmp = atoi(optarg); + + if (tmp < 0 || tmp > KSM_MERGE_LAST) { + printf("Invalid merge type\n"); + return KSFT_FAIL; + } + merge_type = tmp; + } + break; case 'M': break; case 'U': @@ -795,35 +892,36 @@ int main(int argc, char *argv[]) switch (test_name) { case CHECK_KSM_MERGE: - ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ret = check_ksm_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, page_size); break; case CHECK_KSM_UNMERGE: - ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - page_size); + ret = check_ksm_unmerge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, page_size); break; case 
CHECK_KSM_ZERO_PAGE_MERGE: - ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, - ksm_scan_limit_sec, use_zero_pages, page_size); + ret = check_ksm_zero_page_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + page_count, ksm_scan_limit_sec, use_zero_pages, + page_size); break; case CHECK_KSM_NUMA_MERGE: - ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - merge_across_nodes, page_size); + ret = check_ksm_numa_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, merge_across_nodes, page_size); break; case KSM_MERGE_TIME: if (size_MB == 0) { printf("Option '-s' is required.\n"); return KSFT_FAIL; } - ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - size_MB); + ret = ksm_merge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); break; case KSM_MERGE_TIME_HUGE_PAGES: if (size_MB == 0) { printf("Option '-s' is required.\n"); return KSFT_FAIL; } - ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ret = ksm_merge_hugepages_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; case KSM_UNMERGE_TIME: @@ -831,12 +929,12 @@ int main(int argc, char *argv[]) printf("Option '-s' is required.\n"); return KSFT_FAIL; } - ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ret = ksm_unmerge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; case KSM_COW_TIME: - ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - page_size); + ret = ksm_cow_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, page_size); break; } -- cgit v1.2.3
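For anyone wanting to exercise the new interface outside the selftest harness, below is a minimal illustrative sketch (not taken from the series) of how a process opts itself into KSM with PR_SET_MEMORY_MERGE and reads the flag back with PR_GET_MEMORY_MERGE. It assumes a kernel built with CONFIG_KSM and this prctl support; the fallback constant values mirror the ones added to tools/include/uapi/linux/prctl.h above.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE	67
#define PR_GET_MEMORY_MERGE	68
#endif

int main(void)
{
	size_t len = 64 * 4096;
	char *buf;

	/* Opt the whole process into KSM. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0)) {
		perror("PR_SET_MEMORY_MERGE");	/* e.g. EINVAL on older kernels */
		return 1;
	}

	/* Anonymous pages with identical contents become merge candidates
	 * once ksmd scans them. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0x55, len);

	/* Should print 1 while the flag is set. */
	printf("PR_GET_MEMORY_MERGE = %d\n",
	       prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));

	/* Clear the flag again. */
	if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0))
		perror("PR_SET_MEMORY_MERGE=0");

	munmap(buf, len);
	return 0;
}

With the series applied, the equivalent selftest paths are exercised by, for example, ksm_tests -M -t 1 (prctl-based merging, versus -t 0 for madvise), with -d added to dump /proc/self/ksm_stat and the /sys/kernel/mm/ksm counters.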
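As a second sketch, the hypothetical helpers below only mirror what the -t switch selects inside ksm_merge_pages(): madvise(MADV_MERGEABLE) marks a single mapping as mergeable, while PR_SET_MEMORY_MERGE opts in all anonymous VMAs of the calling process. In both cases ksmd still has to be running, which the selftest arranges by writing 1 to /sys/kernel/mm/ksm/run (normally root-only). The helper names are made up for illustration.

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE	67
#endif

/* Choose between the per-VMA madvise path and the per-process prctl path. */
int request_merge(void *addr, size_t size, int use_prctl)
{
	if (use_prctl)
		return prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);

	return madvise(addr, size, MADV_MERGEABLE);
}

/* Start the KSM scanner thread; needs write access to the KSM sysfs dir. */
int start_ksmd(void)
{
	FILE *f = fopen("/sys/kernel/mm/ksm/run", "w");

	if (!f)
		return -1;
	if (fprintf(f, "1") < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}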