-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 8
-rw-r--r--  Documentation/admin-guide/mm/cma_debugfs.rst | 10
-rw-r--r--  Documentation/admin-guide/mm/userfaultfd.rst | 41
-rw-r--r--  Documentation/admin-guide/sysctl/kernel.rst | 11
-rw-r--r--  Documentation/admin-guide/sysctl/vm.rst | 3
-rw-r--r--  Documentation/mm/page_owner.rst | 5
-rw-r--r--  arch/alpha/include/uapi/asm/mman.h | 2
-rw-r--r--  arch/arc/Kconfig | 2
-rw-r--r--  arch/arm/Kconfig | 2
-rw-r--r--  arch/arm/configs/imx_v6_v7_defconfig | 2
-rw-r--r--  arch/arm/configs/milbeaut_m10v_defconfig | 2
-rw-r--r--  arch/arm/configs/oxnas_v6_defconfig | 2
-rw-r--r--  arch/arm/configs/pxa_defconfig | 2
-rw-r--r--  arch/arm/configs/sama7_defconfig | 2
-rw-r--r--  arch/arm/configs/sp7021_defconfig | 2
-rw-r--r--  arch/arm64/Kconfig | 2
-rw-r--r--  arch/csky/Kconfig | 2
-rw-r--r--  arch/ia64/Kconfig | 2
-rw-r--r--  arch/ia64/include/asm/sparsemem.h | 6
-rw-r--r--  arch/loongarch/Kconfig | 2
-rw-r--r--  arch/m68k/Kconfig.cpu | 2
-rw-r--r--  arch/mips/Kconfig | 2
-rw-r--r--  arch/mips/include/uapi/asm/mman.h | 2
-rw-r--r--  arch/nios2/Kconfig | 2
-rw-r--r--  arch/parisc/include/uapi/asm/mman.h | 2
-rw-r--r--  arch/powerpc/Kconfig | 2
-rw-r--r--  arch/powerpc/configs/85xx/ge_imp3a_defconfig | 2
-rw-r--r--  arch/powerpc/configs/fsl-emb-nonhw.config | 2
-rw-r--r--  arch/s390/mm/hugetlbpage.c | 10
-rw-r--r--  arch/sh/configs/ecovec24_defconfig | 2
-rw-r--r--  arch/sh/mm/Kconfig | 2
-rw-r--r--  arch/sparc/Kconfig | 2
-rw-r--r--  arch/xtensa/Kconfig | 2
-rw-r--r--  arch/xtensa/include/uapi/asm/mman.h | 2
-rw-r--r--  drivers/base/memory.c | 6
-rw-r--r--  drivers/block/zram/zram_drv.c | 27
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/btrfs/compression.c | 31
-rw-r--r--  fs/btrfs/extent_io.c | 33
-rw-r--r--  fs/btrfs/subpage.c | 2
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 32
-rw-r--r--  fs/buffer.c | 154
-rw-r--r--  fs/ext2/balloc.c | 7
-rw-r--r--  fs/gfs2/meta_io.c | 7
-rw-r--r--  fs/gfs2/quota.c | 8
-rw-r--r--  fs/isofs/compress.c | 2
-rw-r--r--  fs/jbd2/journal.c | 15
-rw-r--r--  fs/jbd2/recovery.c | 16
-rw-r--r--  fs/nilfs2/page.c | 45
-rw-r--r--  fs/ntfs3/inode.c | 7
-rw-r--r--  fs/ocfs2/aops.c | 2
-rw-r--r--  fs/ocfs2/super.c | 4
-rw-r--r--  fs/proc/task_mmu.c | 2
-rw-r--r--  fs/ramfs/file-nommu.c | 50
-rw-r--r--  fs/reiserfs/journal.c | 11
-rw-r--r--  fs/reiserfs/stree.c | 4
-rw-r--r--  fs/reiserfs/super.c | 4
-rw-r--r--  fs/udf/dir.c | 2
-rw-r--r--  fs/udf/directory.c | 2
-rw-r--r--  fs/udf/inode.c | 8
-rw-r--r--  fs/ufs/balloc.c | 12
-rw-r--r--  fs/userfaultfd.c | 71
-rw-r--r--  include/linux/buffer_head.h | 48
-rw-r--r--  include/linux/gfp.h | 3
-rw-r--r--  include/linux/huge_mm.h | 23
-rw-r--r--  include/linux/hugetlb.h | 4
-rw-r--r--  include/linux/hugetlb_cgroup.h | 19
-rw-r--r--  include/linux/memcontrol.h | 7
-rw-r--r--  include/linux/memory_hotplug.h | 22
-rw-r--r--  include/linux/mempolicy.h | 13
-rw-r--r--  include/linux/mm.h | 35
-rw-r--r--  include/linux/mmzone.h | 28
-rw-r--r--  include/linux/page_counter.h | 35
-rw-r--r--  include/linux/page_ext.h | 24
-rw-r--r--  include/linux/page_idle.h | 34
-rw-r--r--  include/linux/pagemap.h | 4
-rw-r--r--  include/linux/pagewalk.h | 10
-rw-r--r--  include/linux/pgtable.h | 9
-rw-r--r--  include/linux/rmap.h | 66
-rw-r--r--  include/linux/sched/sysctl.h | 1
-rw-r--r--  include/linux/slab.h | 6
-rw-r--r--  include/trace/events/huge_memory.h | 1
-rw-r--r--  include/uapi/asm-generic/mman-common.h | 2
-rw-r--r--  include/uapi/linux/userfaultfd.h | 4
-rw-r--r--  init/main.c | 6
-rw-r--r--  kernel/sched/core.c | 14
-rw-r--r--  kernel/sched/debug.c | 1
-rw-r--r--  kernel/sched/fair.c | 164
-rw-r--r--  kernel/sched/sched.h | 1
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  mm/Kconfig | 8
-rw-r--r--  mm/backing-dev.c | 6
-rw-r--r--  mm/cma_debug.c | 5
-rw-r--r--  mm/compaction.c | 14
-rw-r--r--  mm/damon/core-test.h | 6
-rw-r--r--  mm/damon/core.c | 21
-rw-r--r--  mm/damon/dbgfs.c | 2
-rw-r--r--  mm/damon/ops-common.c | 46
-rw-r--r--  mm/damon/paddr.c | 5
-rw-r--r--  mm/damon/vaddr.c | 31
-rw-r--r--  mm/filemap.c | 87
-rw-r--r--  mm/gup.c | 215
-rw-r--r--  mm/huge_memory.c | 69
-rw-r--r--  mm/hugetlb.c | 137
-rw-r--r--  mm/hugetlb_cgroup.c | 27
-rw-r--r--  mm/hugetlb_vmemmap.c | 10
-rw-r--r--  mm/internal.h | 2
-rw-r--r--  mm/kfence/core.c | 7
-rw-r--r--  mm/khugepaged.c | 763
-rw-r--r--  mm/kmemleak.c | 21
-rw-r--r--  mm/ksm.c | 11
-rw-r--r--  mm/madvise.c | 9
-rw-r--r--  mm/memcontrol.c | 11
-rw-r--r--  mm/memory-failure.c | 80
-rw-r--r--  mm/memory.c | 15
-rw-r--r--  mm/memory_hotplug.c | 5
-rw-r--r--  mm/mempolicy.c | 6
-rw-r--r--  mm/memremap.c | 2
-rw-r--r--  mm/migrate.c | 22
-rw-r--r--  mm/migrate_device.c | 13
-rw-r--r--  mm/mmap.c | 3
-rw-r--r--  mm/mprotect.c | 8
-rw-r--r--  mm/page_alloc.c | 39
-rw-r--r--  mm/page_counter.c | 15
-rw-r--r--  mm/page_ext.c | 117
-rw-r--r--  mm/page_io.c | 9
-rw-r--r--  mm/page_owner.c | 97
-rw-r--r--  mm/page_table_check.c | 10
-rw-r--r--  mm/pagewalk.c | 10
-rw-r--r--  mm/rmap.c | 26
-rw-r--r--  mm/slub.c | 26
-rw-r--r--  mm/swap.h | 4
-rw-r--r--  mm/util.c | 4
-rw-r--r--  mm/vmalloc.c | 15
-rw-r--r--  mm/vmscan.c | 45
-rw-r--r--  mm/vmstat.c | 1
-rw-r--r--  mm/zsmalloc.c | 14
-rw-r--r--  mm/zswap.c | 2
-rw-r--r--  tools/include/uapi/asm-generic/mman-common.h | 2
-rw-r--r--  tools/testing/selftests/vm/.gitignore | 1
-rw-r--r--  tools/testing/selftests/vm/Makefile | 21
-rw-r--r--  tools/testing/selftests/vm/check_config.sh | 31
-rw-r--r--  tools/testing/selftests/vm/hmm-tests.c | 108
-rw-r--r--  tools/testing/selftests/vm/khugepaged.c | 563
-rwxr-xr-x  tools/testing/selftests/vm/run_vmtests.sh | 15
-rw-r--r--  tools/testing/selftests/vm/userfaultfd.c | 76
-rw-r--r--  tools/vm/page_owner_sort.c | 7
147 files changed, 2603 insertions, 1625 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 426fa892d311..3b95f65bafe2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1471,6 +1471,14 @@
Permit 'security.evm' to be updated regardless of
current integrity status.
+ early_page_ext [KNL] Enforces page_ext initialization to earlier
+ stages so as to cover more early boot allocations.
+ Please note that as a side effect some optimizations
+ might be disabled to achieve that (e.g. parallelized
+ memory initialization is disabled) so the boot process
+ might take longer, especially on systems with a lot of
+ memory. Available with CONFIG_PAGE_EXTENSION=y.
+
failslab=
fail_usercopy=
fail_page_alloc=
diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst
index 4e06ffabd78a..7367e6294ef6 100644
--- a/Documentation/admin-guide/mm/cma_debugfs.rst
+++ b/Documentation/admin-guide/mm/cma_debugfs.rst
@@ -5,10 +5,10 @@ CMA Debugfs Interface
The CMA debugfs interface is useful to retrieve basic information out of the
different CMA areas and to test allocation/release in each of the areas.
-Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
-kernel's CMA index. So the first CMA zone would be:
+Each CMA area represents a directory under <debugfs>/cma/, represented by
+its CMA name like below:
- <debugfs>/cma/cma-0
+ <debugfs>/cma/<cma_name>
The structure of the files created under that directory is as follows:
@@ -18,8 +18,8 @@ The structure of the files created under that directory is as follows:
- [RO] bitmap: The bitmap of page states in the zone.
- [WO] alloc: Allocate N pages from that CMA area. For example::
- echo 5 > <debugfs>/cma/cma-2/alloc
+ echo 5 > <debugfs>/cma/<cma_name>/alloc
-would try to allocate 5 pages from the cma-2 area.
+would try to allocate 5 pages from the 'cma_name' area.
- [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 6528036093e1..83f31919ebb3 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -17,7 +17,10 @@ of the ``PROT_NONE+SIGSEGV`` trick.
Design
======
-Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
+Userspace creates a new userfaultfd, initializes it, and registers one or more
+regions of virtual memory with it. Then, any page faults which occur within the
+region(s) result in a message being delivered to the userfaultfd, notifying
+userspace of the fault.
The ``userfaultfd`` (aside from registering and unregistering virtual
memory ranges) provides two primary functionalities:
@@ -34,12 +37,11 @@ The real advantage of userfaults if compared to regular virtual memory
management of mremap/mprotect is that the userfaults in all their
operations never involve heavyweight structures like vmas (in fact the
``userfaultfd`` runtime load never takes the mmap_lock for writing).
-
Vmas are not suitable for page- (or hugepage) granular fault tracking
when dealing with virtual address spaces that could span
Terabytes. Too many vmas would be needed for that.
-The ``userfaultfd`` once opened by invoking the syscall, can also be
+The ``userfaultfd``, once created, can also be
passed using unix domain sockets to a manager process, so the same
manager process could handle the userfaults of a multitude of
different processes without them being aware about what is going on
@@ -50,6 +52,39 @@ is a corner case that would currently return ``-EBUSY``).
API
===
+Creating a userfaultfd
+----------------------
+
+There are two ways to create a new userfaultfd, each of which provide ways to
+restrict access to this functionality (since historically userfaultfds which
+handle kernel page faults have been a useful tool for exploiting the kernel).
+
+The first way, supported since userfaultfd was introduced, is the
+userfaultfd(2) syscall. Access to this is controlled in several ways:
+
+- Any user can always create a userfaultfd which traps userspace page faults
+ only. Such a userfaultfd can be created using the userfaultfd(2) syscall
+ with the flag UFFD_USER_MODE_ONLY.
+
+- In order to also trap kernel page faults for the address space, either the
+ process needs the CAP_SYS_PTRACE capability, or the system must have
+ vm.unprivileged_userfaultfd set to 1. By default, vm.unprivileged_userfaultfd
+ is set to 0.
+
+The second way, added to the kernel more recently, is by opening
+/dev/userfaultfd and issuing a USERFAULTFD_IOC_NEW ioctl to it. This method
+yields equivalent userfaultfds to the userfaultfd(2) syscall.
+
+Unlike userfaultfd(2), access to /dev/userfaultfd is controlled via normal
+filesystem permissions (user/group/mode), which gives fine grained access to
+userfaultfd specifically, without also granting other unrelated privileges at
+the same time (as e.g. granting CAP_SYS_PTRACE would do). Users who have access
+to /dev/userfaultfd can always create userfaultfds that trap kernel page faults;
+vm.unprivileged_userfaultfd is not considered.
+
+Initializing a userfaultfd
+--------------------------
+
When first opened the ``userfaultfd`` must be enabled invoking the
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
a later API version) which will specify the ``read/POLLIN`` protocol
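As a concrete illustration of the two creation paths and the ``UFFDIO_API`` handshake described above, here is a minimal userspace sketch. It is not part of this patch; it assumes uapi headers that already expose UFFD_USER_MODE_ONLY and USERFAULTFD_IOC_NEW, and most error handling is omitted.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int uffd_open_dev(int flags)
{
	/* Second path: ask the character device for a new userfaultfd. */
	int dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
	int uffd;

	if (dev < 0)
		return -1;
	uffd = ioctl(dev, USERFAULTFD_IOC_NEW, flags);
	close(dev);
	return uffd;
}

int main(void)
{
	/* First path: plain syscall, restricted to userspace faults only. */
	int uffd = syscall(__NR_userfaultfd,
			   O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);

	if (uffd < 0)	/* e.g. old kernel: try /dev/userfaultfd instead */
		uffd = uffd_open_dev(O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		perror("userfaultfd");
		return 1;
	}

	/* Initialize: the UFFDIO_API handshake must precede registration. */
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };

	if (ioctl(uffd, UFFDIO_API, &api)) {
		perror("UFFDIO_API");
		return 1;
	}
	printf("userfaultfd %d ready, features 0x%llx\n", uffd,
	       (unsigned long long)api.features);
	return 0;
}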
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index ee6572b1edad..835c8844bba4 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -635,6 +635,17 @@ different types of memory (represented as different NUMA nodes) to
place the hot pages in the fast memory. This is implemented based on
unmapping and page fault too.
+numa_balancing_promote_rate_limit_MBps
+======================================
+
+Too high promotion/demotion throughput between different memory types
+may hurt application latency. This can be used to rate limit the
+promotion throughput. The per-node max promotion throughput in MB/s
+will be limited to be no more than the set value.
+
+A rule of thumb is to set this to less than 1/10 of the PMEM node
+write bandwidth.
+
oops_all_cpu_backtrace
======================
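A hedged sketch of applying the rule of thumb from the numa_balancing_promote_rate_limit_MBps description above: it assumes the knob is exposed as /proc/sys/kernel/numa_balancing_promote_rate_limit_MBps (the usual mapping for entries documented in this file) and an example PMEM write bandwidth of 2000 MB/s.

#include <stdio.h>

int main(void)
{
	unsigned long pmem_write_mbps = 2000;	/* assumed per-node PMEM write bandwidth */
	unsigned long limit = pmem_write_mbps / 10;	/* "less than 1/10" rule of thumb */
	FILE *fp = fopen("/proc/sys/kernel/numa_balancing_promote_rate_limit_MBps", "w");

	if (!fp) {
		perror("fopen");
		return 1;
	}
	fprintf(fp, "%lu\n", limit);	/* e.g. 200 MB/s per node */
	return fclose(fp) ? 1 : 0;
}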
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 9b833e439f09..988f6a4c8084 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -926,6 +926,9 @@ calls without any restrictions.
The default value is 0.
+Another way to control permissions for userfaultfd is to use
+/dev/userfaultfd instead of userfaultfd(2). See
+Documentation/admin-guide/mm/userfaultfd.rst.
user_reserve_kbytes
===================
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
index f5c954afe97c..f18fd8907049 100644
--- a/Documentation/mm/page_owner.rst
+++ b/Documentation/mm/page_owner.rst
@@ -94,6 +94,11 @@ Usage
Page allocated via order XXX, ...
PFN XXX ...
// Detailed stack
+ By default, it will do a full pfn dump; to start with a given pfn,
+ page_owner supports fseek:
+
+ FILE *fp = fopen("/sys/kernel/debug/page_owner", "r");
+ fseek(fp, pfn_start, SEEK_SET);
The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
in buf, uses regexp to extract the page order value, counts the times
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 4aa996423b0d..763929e814e9 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -76,6 +76,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
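The new MADV_COLLAPSE advice (added here for alpha and, in later hunks, for mips, parisc, xtensa and the generic header) requests a synchronous collapse of an existing mapping into huge pages. Below is a hedged userspace sketch of a caller; the fallback #define mirrors the asm-generic value used by most architectures (parisc differs) and is only needed when the installed uapi headers predate this series.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* asm-generic value added by this series */
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* one PMD-sized range on x86-64 (2 MiB) */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0x5a, len);		/* populate with base pages first */

	/* Synchronously collapse the populated range into huge pages. */
	if (madvise(p, len, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");

	munmap(p, len);
	return 0;
}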
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 9e3653253ef2..d9a13ccf89a3 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -554,7 +554,7 @@ config ARC_BUILTIN_DTB_NAME
endmenu # "ARC Architecture Configuration"
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
default "12" if ARC_HUGEPAGE_16M
default "11"
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 87badeae3181..e6c8ee56ac52 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1434,7 +1434,7 @@ config ARM_MODULE_PLTS
Disabling this is usually safe for small single-platform
configurations. If unsure, say y.
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
default "12" if SOC_AM33XX
default "9" if SA1111
diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig
index 01012537a9b9..fb283059daa0 100644
--- a/arch/arm/configs/imx_v6_v7_defconfig
+++ b/arch/arm/configs/imx_v6_v7_defconfig
@@ -31,7 +31,7 @@ CONFIG_SOC_VF610=y
CONFIG_SMP=y
CONFIG_ARM_PSCI=y
CONFIG_HIGHMEM=y
-CONFIG_FORCE_MAX_ZONEORDER=14
+CONFIG_ARCH_FORCE_MAX_ORDER=14
CONFIG_CMDLINE="noinitrd console=ttymxc0,115200"
CONFIG_KEXEC=y
CONFIG_CPU_FREQ=y
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index 58810e98de3d..8620061e19a8 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -26,7 +26,7 @@ CONFIG_THUMB2_KERNEL=y
# CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11 is not set
# CONFIG_ARM_PATCH_IDIV is not set
CONFIG_HIGHMEM=y
-CONFIG_FORCE_MAX_ZONEORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=12
CONFIG_SECCOMP=y
CONFIG_KEXEC=y
CONFIG_EFI=y
diff --git a/arch/arm/configs/oxnas_v6_defconfig b/arch/arm/configs/oxnas_v6_defconfig
index 600f78b363dd..5c163a9d1429 100644
--- a/arch/arm/configs/oxnas_v6_defconfig
+++ b/arch/arm/configs/oxnas_v6_defconfig
@@ -12,7 +12,7 @@ CONFIG_ARCH_OXNAS=y
CONFIG_MACH_OX820=y
CONFIG_SMP=y
CONFIG_NR_CPUS=16
-CONFIG_FORCE_MAX_ZONEORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=12
CONFIG_SECCOMP=y
CONFIG_ARM_APPENDED_DTB=y
CONFIG_ARM_ATAG_DTB_COMPAT=y
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index 104a45722799..ce3f4ed50498 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -21,7 +21,7 @@ CONFIG_MACH_AKITA=y
CONFIG_MACH_BORZOI=y
CONFIG_PXA_SYSTEMS_CPLDS=y
CONFIG_AEABI=y
-CONFIG_FORCE_MAX_ZONEORDER=9
+CONFIG_ARCH_FORCE_MAX_ORDER=9
CONFIG_CMDLINE="root=/dev/ram0 ro"
CONFIG_KEXEC=y
CONFIG_CPU_FREQ=y
diff --git a/arch/arm/configs/sama7_defconfig b/arch/arm/configs/sama7_defconfig
index 0384030d8b25..8b2cf6ddd568 100644
--- a/arch/arm/configs/sama7_defconfig
+++ b/arch/arm/configs/sama7_defconfig
@@ -19,7 +19,7 @@ CONFIG_ATMEL_CLOCKSOURCE_TCB=y
# CONFIG_CACHE_L2X0 is not set
# CONFIG_ARM_PATCH_IDIV is not set
# CONFIG_CPU_SW_DOMAIN_PAN is not set
-CONFIG_FORCE_MAX_ZONEORDER=15
+CONFIG_ARCH_FORCE_MAX_ORDER=15
CONFIG_UACCESS_WITH_MEMCPY=y
# CONFIG_ATAGS is not set
CONFIG_CMDLINE="console=ttyS0,115200 earlyprintk ignore_loglevel"
diff --git a/arch/arm/configs/sp7021_defconfig b/arch/arm/configs/sp7021_defconfig
index 703b9aaa40f0..151ca8c47373 100644
--- a/arch/arm/configs/sp7021_defconfig
+++ b/arch/arm/configs/sp7021_defconfig
@@ -18,7 +18,7 @@ CONFIG_ARCH_SUNPLUS=y
# CONFIG_VDSO is not set
CONFIG_SMP=y
CONFIG_THUMB2_KERNEL=y
-CONFIG_FORCE_MAX_ZONEORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=12
CONFIG_VFP=y
CONFIG_NEON=y
CONFIG_MODULES=y
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9fb9fff08c94..c5c7d812704c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1418,7 +1418,7 @@ config XEN
help
Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int
default "14" if ARM64_64K_PAGES
default "12" if ARM64_16K_PAGES
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 3cbc2dc62baf..adee6ab36862 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -332,7 +332,7 @@ config HIGHMEM
select KMAP_LOCAL
default y
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
default "11"
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 26ac8ea15a9e..c6e06cdc738f 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -200,7 +200,7 @@ config IA64_CYCLONE
Say Y here to enable support for IBM EXA Cyclone time source.
If you're unsure, answer N.
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE
range 11 17 if !HUGETLB_PAGE
default "17" if HUGETLB_PAGE
diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h
index 42ed5248fae9..84e8ce387b69 100644
--- a/arch/ia64/include/asm/sparsemem.h
+++ b/arch/ia64/include/asm/sparsemem.h
@@ -11,10 +11,10 @@
#define SECTION_SIZE_BITS (30)
#define MAX_PHYSMEM_BITS (50)
-#ifdef CONFIG_FORCE_MAX_ZONEORDER
-#if ((CONFIG_FORCE_MAX_ZONEORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS)
+#ifdef CONFIG_ARCH_FORCE_MAX_ORDER
+#if ((CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS)
#undef SECTION_SIZE_BITS
-#define SECTION_SIZE_BITS (CONFIG_FORCE_MAX_ZONEORDER - 1 + PAGE_SHIFT)
+#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT)
#endif
#endif
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 26aeb1408e56..3c7a5a54b808 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -370,7 +370,7 @@ config NODES_SHIFT
default "6"
depends on NUMA
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
range 14 64 if PAGE_SIZE_64KB
default "14" if PAGE_SIZE_64KB
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index e0e9e31339c1..3b2f39508524 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -399,7 +399,7 @@ config SINGLE_MEMORY_CHUNK
order" to save memory that could be wasted for unused memory map.
Say N if not sure.
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order" if ADVANCED
depends on !SINGLE_MEMORY_CHUNK
default "11"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index ec21f8999249..70d28976a40d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2140,7 +2140,7 @@ config PAGE_SIZE_64KB
endchoice
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
range 14 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
default "14" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 1be428663c10..c6e1fc77c996 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -103,6 +103,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4167f1eb4cd8..a582f72104f3 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -44,7 +44,7 @@ menu "Kernel features"
source "kernel/Kconfig.hz"
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
range 9 20
default "11"
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index a7ea3204a5fa..22133a6a506e 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -70,6 +70,8 @@
#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */
+#define MADV_COLLAPSE 73 /* Synchronous hugepage collapse */
+
#define MADV_HWPOISON 100 /* poison a page for testing */
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 4c466acdc70d..39d71d7701bd 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -845,7 +845,7 @@ config DATA_SHIFT
in that case. If PIN_TLB is selected, it must be aligned to 8M as
8M pages will be pinned.
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
range 8 9 if PPC64 && PPC_64K_PAGES
default "9" if PPC64 && PPC_64K_PAGES
diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
index f29c166998af..e7672c186325 100644
--- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig
+++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
@@ -30,7 +30,7 @@ CONFIG_PREEMPT=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
CONFIG_MATH_EMULATION=y
-CONFIG_FORCE_MAX_ZONEORDER=17
+CONFIG_ARCH_FORCE_MAX_ORDER=17
CONFIG_PCI=y
CONFIG_PCIEPORTBUS=y
CONFIG_PCI_MSI=y
diff --git a/arch/powerpc/configs/fsl-emb-nonhw.config b/arch/powerpc/configs/fsl-emb-nonhw.config
index f14c6dbd7346..ab8a8c4530d9 100644
--- a/arch/powerpc/configs/fsl-emb-nonhw.config
+++ b/arch/powerpc/configs/fsl-emb-nonhw.config
@@ -41,7 +41,7 @@ CONFIG_FIXED_PHY=y
CONFIG_FONT_8x16=y
CONFIG_FONT_8x8=y
CONFIG_FONTS=y
-CONFIG_FORCE_MAX_ZONEORDER=13
+CONFIG_ARCH_FORCE_MAX_ORDER=13
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAME_WARN=1024
CONFIG_FTL=y
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 10e51ef9c79a..c299a18273ff 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -237,16 +237,6 @@ int pud_huge(pud_t pud)
return pud_large(pud);
}
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
-{
- if (flags & FOLL_GET)
- return NULL;
-
- return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-}
-
bool __init arch_hugetlb_valid_size(unsigned long size)
{
if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig
index e699e2e04128..b52e14ccb450 100644
--- a/arch/sh/configs/ecovec24_defconfig
+++ b/arch/sh/configs/ecovec24_defconfig
@@ -8,7 +8,7 @@ CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
CONFIG_CPU_SUBTYPE_SH7724=y
-CONFIG_FORCE_MAX_ZONEORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=12
CONFIG_MEMORY_SIZE=0x10000000
CONFIG_FLATMEM_MANUAL=y
CONFIG_SH_ECOVEC=y
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index ba569cfb4368..411fdc0901f7 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -18,7 +18,7 @@ config PAGE_OFFSET
default "0x80000000" if MMU
default "0x00000000"
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
range 9 64 if PAGE_SIZE_16KB
default "9" if PAGE_SIZE_16KB
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 1c852bb530ec..4d3d1af90d52 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -269,7 +269,7 @@ config ARCH_SPARSEMEM_ENABLE
config ARCH_SPARSEMEM_DEFAULT
def_bool y if SPARC64
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
default "13"
help
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 12ac277282ba..bcb0c5d2abc2 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -771,7 +771,7 @@ config HIGHMEM
If unsure, say Y.
-config FORCE_MAX_ZONEORDER
+config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
default "11"
help
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 7966a58af472..1ff0c858544f 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -111,6 +111,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index bc60c9cd3230..9aa0da991cfb 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -869,12 +869,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
}
}
-/* return true if the memory block is offlined, otherwise, return false */
-bool is_memblock_offlined(struct memory_block *mem)
-{
- return mem->state == MEM_OFFLINE;
-}
-
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
&dev_attr_probe.attr,
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 226ea76cc819..607f4634c27d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1410,9 +1410,19 @@ compress_again:
handle = zs_malloc(zram->mem_pool, comp_len,
GFP_NOIO | __GFP_HIGHMEM |
__GFP_MOVABLE);
- if (!IS_ERR((void *)handle))
+ if (IS_ERR((void *)handle))
+ return PTR_ERR((void *)handle);
+
+ if (comp_len != PAGE_SIZE)
goto compress_again;
- return PTR_ERR((void *)handle);
+ /*
+ * If the page is not compressible, you need to acquire the lock and
+ * execute the code below. The zcomp_stream_get() call is needed to
+ * disable the cpu hotplug and grab the zstrm buffer back.
+ * It is necessary that the dereferencing of the zstrm variable below
+ * occurs correctly.
+ */
+ zstrm = zcomp_stream_get(zram->comp);
}
alloced_pages = zs_get_total_pages(zram->mem_pool);
@@ -1710,9 +1720,6 @@ out:
static void zram_reset_device(struct zram *zram)
{
- struct zcomp *comp;
- u64 disksize;
-
down_write(&zram->init_lock);
zram->limit_pages = 0;
@@ -1722,17 +1729,15 @@ static void zram_reset_device(struct zram *zram)
return;
}
- comp = zram->comp;
- disksize = zram->disksize;
- zram->disksize = 0;
-
set_capacity_and_notify(zram->disk, 0);
part_stat_set_all(zram->disk->part0, 0);
/* I/O operation under all of CPU are done so let's free */
- zram_meta_free(zram, disksize);
+ zram_meta_free(zram, zram->disksize);
+ zram->disksize = 0;
memset(&zram->stats, 0, sizeof(zram->stats));
- zcomp_destroy(comp);
+ zcomp_destroy(zram->comp);
+ zram->comp = NULL;
reset_bdev(zram);
up_write(&zram->init_lock);
diff --git a/fs/Kconfig b/fs/Kconfig
index a547307c1ae8..2685a4d0d353 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config ARCH_SUPPORTS_HUGETLBFS
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on (SYSFS || SYSCTL)
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e84d22c5c6a8..05d228efcd31 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -8,6 +8,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
@@ -222,8 +223,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
- struct page *pages[16];
- unsigned long nr_pages = end_index - index + 1;
+ struct folio_batch fbatch;
const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
@@ -231,24 +231,23 @@ static noinline void end_compressed_writeback(struct inode *inode,
if (errno)
mapping_set_error(inode->i_mapping, errno);
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- nr_pages -= 1;
- index += 1;
- continue;
- }
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ ret = filemap_get_folios(inode->i_mapping, &index, end_index,
+ &fbatch);
+
+ if (ret == 0)
+ return;
+
for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
if (errno)
- SetPageError(pages[i]);
- btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ folio_set_error(folio);
+ btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
cb->start, cb->len);
- put_page(pages[i]);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
}
/* the inode may be gone now */
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cf4f19e80e2f..013d6348deee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1900,9 +1900,8 @@ static int __process_pages_contig(struct address_space *mapping,
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
pgoff_t index = start_index;
- unsigned long nr_pages = end_index - start_index + 1;
unsigned long pages_processed = 0;
- struct page *pages[16];
+ struct folio_batch fbatch;
int err = 0;
int i;
@@ -1911,16 +1910,17 @@ static int __process_pages_contig(struct address_space *mapping,
ASSERT(processed_end && *processed_end == start);
}
- if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index)
mapping_set_error(mapping, -EIO);
- while (nr_pages > 0) {
- int found_pages;
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ int found_folios;
+
+ found_folios = filemap_get_folios_contig(mapping, &index,
+ end_index, &fbatch);
- found_pages = find_get_pages_contig(mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (found_pages == 0) {
+ if (found_folios == 0) {
/*
* Only if we're going to lock these pages, we can find
* nothing at @index.
@@ -1930,23 +1930,20 @@ static int __process_pages_contig(struct address_space *mapping,
goto out;
}
- for (i = 0; i < found_pages; i++) {
+ for (i = 0; i < found_folios; i++) {
int process_ret;
-
+ struct folio *folio = fbatch.folios[i];
process_ret = process_one_page(fs_info, mapping,
- pages[i], locked_page, page_ops,
+ &folio->page, locked_page, page_ops,
start, end);
if (process_ret < 0) {
- for (; i < found_pages; i++)
- put_page(pages[i]);
err = -EAGAIN;
+ folio_batch_release(&fbatch);
goto out;
}
- put_page(pages[i]);
- pages_processed++;
+ pages_processed += folio_nr_pages(folio);
}
- nr_pages -= found_pages;
- index += found_pages;
+ folio_batch_release(&fbatch);
cond_resched();
}
out:
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 6fc2b77ae5c3..9a176af847d7 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -337,7 +337,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
*
* Even with 0 returned, the page still need extra check to make sure
* it's really the correct page, as the caller is using
- * find_get_pages_contig(), which can race with page invalidating.
+ * filemap_get_folios_contig(), which can race with page invalidating.
*/
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index a232b15b8021..26b0c99f54b8 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -4,6 +4,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sizes.h>
@@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
{
int ret;
- struct page *pages[16];
+ struct folio_batch fbatch;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- unsigned long nr_pages = end_index - index + 1;
int i;
int count = 0;
int loops = 0;
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long, nr_pages,
- ARRAY_SIZE(pages)), pages);
+ folio_batch_init(&fbatch);
+
+ while (index <= end_index) {
+ ret = filemap_get_folios_contig(inode->i_mapping, &index,
+ end_index, &fbatch);
for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
if (flags & PROCESS_TEST_LOCKED &&
- !PageLocked(pages[i]))
+ !folio_test_locked(folio))
count++;
- if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (flags & PROCESS_UNLOCK && folio_test_locked(folio))
+ folio_unlock(folio);
if (flags & PROCESS_RELEASE)
- put_page(pages[i]);
+ folio_put(folio);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
cond_resched();
loops++;
if (loops > 100000) {
printk(KERN_ERR
- "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n",
- start, end, nr_pages, ret);
+ "stuck in a loop, start %llu, end %llu, ret %d\n",
+ start, end, ret);
break;
}
}
+
return count;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 55e762a58eb6..b4c9fff3ab6c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -152,7 +152,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
/*
* Default synchronous end-of-IO handler.. Just mark it up-to-date and
- * unlock the buffer. This is what ll_rw_block uses too.
+ * unlock the buffer.
*/
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
@@ -491,8 +491,8 @@ int inode_has_buffers(struct inode *inode)
* all already-submitted IO to complete, but does not queue any new
* writes to the disk.
*
- * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
- * you dirty the buffers, and then use osync_inode_buffers to wait for
+ * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
+ * as you dirty the buffers, and then use osync_inode_buffers to wait for
* completion. Any other dirty buffers which are not yet queued for
* write will not be flushed to disk by the osync.
*/
@@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev,
struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
- ll_rw_block(REQ_OP_WRITE, 1, &bh);
+ write_dirty_buffer(bh, 0);
put_bh(bh);
}
}
@@ -1342,23 +1342,12 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
if (likely(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
+ bh_readahead(bh, REQ_RAHEAD);
brelse(bh);
}
}
EXPORT_SYMBOL(__breadahead);
-void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
- gfp_t gfp)
-{
- struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
- if (likely(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
- brelse(bh);
- }
-}
-EXPORT_SYMBOL(__breadahead_gfp);
-
/**
* __bread_gfp() - reads a specified block and returns the bh
* @bdev: the block_device to read from
@@ -1817,7 +1806,7 @@ done:
/*
* The page was marked dirty, but the buffers were
* clean. Someone wrote them back by hand with
- * ll_rw_block/submit_bh. A rare case.
+ * write_dirty_buffer/submit_bh. A rare case.
*/
end_page_writeback(page);
@@ -2033,7 +2022,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
*wait_bh++=bh;
}
}
@@ -2593,11 +2582,9 @@ int block_truncate_page(struct address_space *mapping,
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
- err = -EIO;
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
+ err = bh_read(bh, 0);
/* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ if (err < 0)
goto unlock;
}
@@ -2726,61 +2713,6 @@ int submit_bh(blk_opf_t opf, struct buffer_head *bh)
}
EXPORT_SYMBOL(submit_bh);
-/**
- * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @opf: block layer request operation and flags.
- * @nr: number of &struct buffer_heads in the array
- * @bhs: array of pointers to &struct buffer_head
- *
- * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
- * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
- * @opf contains flags modifying the detailed I/O behavior, most notably
- * %REQ_RAHEAD.
- *
- * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit), any buffer that appears to be clean when doing a write
- * request, and any buffer that appears to be up-to-date when doing read
- * request. Further it marks as clean buffers that are processed for
- * writing (the buffer cache won't assume that they are actually clean
- * until the buffer gets unlocked).
- *
- * ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
- * any waiters.
- *
- * All of the buffers must be for the same device, and must also be a
- * multiple of the current approved size for the device.
- */
-void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[])
-{
- const enum req_op op = opf & REQ_OP_MASK;
- int i;
-
- for (i = 0; i < nr; i++) {
- struct buffer_head *bh = bhs[i];
-
- if (!trylock_buffer(bh))
- continue;
- if (op == REQ_OP_WRITE) {
- if (test_clear_buffer_dirty(bh)) {
- bh->b_end_io = end_buffer_write_sync;
- get_bh(bh);
- submit_bh(opf, bh);
- continue;
- }
- } else {
- if (!buffer_uptodate(bh)) {
- bh->b_end_io = end_buffer_read_sync;
- get_bh(bh);
- submit_bh(opf, bh);
- continue;
- }
- }
- unlock_buffer(bh);
- }
-}
-EXPORT_SYMBOL(ll_rw_block);
-
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
lock_buffer(bh);
@@ -3029,29 +2961,69 @@ int bh_uptodate_or_lock(struct buffer_head *bh)
EXPORT_SYMBOL(bh_uptodate_or_lock);
/**
- * bh_submit_read - Submit a locked buffer for reading
+ * __bh_read - Submit read for a locked buffer
* @bh: struct buffer_head
+ * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
+ * @wait: wait until reading finish
*
- * Returns zero on success and -EIO on error.
+ * Returns zero on success or don't wait, and -EIO on error.
*/
-int bh_submit_read(struct buffer_head *bh)
+int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
- BUG_ON(!buffer_locked(bh));
+ int ret = 0;
- if (buffer_uptodate(bh)) {
- unlock_buffer(bh);
- return 0;
- }
+ BUG_ON(!buffer_locked(bh));
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return 0;
- return -EIO;
+ submit_bh(REQ_OP_READ | op_flags, bh);
+ if (wait) {
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ ret = -EIO;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__bh_read);
+
+/**
+ * __bh_read_batch - Submit read for a batch of unlocked buffers
+ * @nr: entry number of the buffer batch
+ * @bhs: a batch of struct buffer_head
+ * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
+ * @force_lock: force to get a lock on the buffer if set, otherwise drops any
+ * buffer that cannot lock.
+ *
+ * Returns zero on success or don't wait, and -EIO on error.
+ */
+void __bh_read_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags, bool force_lock)
+{
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ struct buffer_head *bh = bhs[i];
+
+ if (buffer_uptodate(bh))
+ continue;
+
+ if (force_lock)
+ lock_buffer(bh);
+ else
+ if (!trylock_buffer(bh))
+ continue;
+
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+
+ bh->b_end_io = end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_READ | op_flags, bh);
+ }
}
-EXPORT_SYMBOL(bh_submit_read);
+EXPORT_SYMBOL(__bh_read_batch);
void __init buffer_init(void)
{
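The fs/buffer.c hunk above removes ll_rw_block() and bh_submit_read() in favour of the __bh_read()/__bh_read_batch() primitives and their wrappers (bh_read(), bh_read_nowait(), bh_readahead() and the *_batch variants declared later in buffer_head.h). The filesystem hunks that follow all apply the same caller conversion; the following is a hedged, illustrative kernel-side sketch of that pattern, not code from this patch, with example_read_block() and its buffer_head argument purely hypothetical.

#include <linux/buffer_head.h>

static int example_read_block(struct buffer_head *bh)
{
	int err;

	/*
	 * Old pattern being removed throughout the hunks below:
	 *
	 *	if (!buffer_uptodate(bh)) {
	 *		ll_rw_block(REQ_OP_READ, 1, &bh);
	 *		wait_on_buffer(bh);
	 *		if (!buffer_uptodate(bh))
	 *			return -EIO;
	 *	}
	 */

	/* New synchronous form: locks, submits and waits; <0 on I/O error. */
	err = bh_read(bh, 0);
	if (err < 0)
		return err;

	/*
	 * Asynchronous variants used where the caller waits later (both are
	 * no-ops here because bh is already uptodate at this point):
	 */
	bh_read_nowait(bh, 0);		/* submit unless already uptodate */
	bh_readahead(bh, REQ_RAHEAD);	/* best-effort readahead hint */

	return 0;
}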
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c17ccc19b938..5dc0a31f4a08 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -126,6 +126,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
struct ext2_group_desc * desc;
struct buffer_head * bh = NULL;
ext2_fsblk_t bitmap_blk;
+ int ret;
desc = ext2_get_group_desc(sb, block_group, NULL);
if (!desc)
@@ -139,10 +140,10 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
block_group, le32_to_cpu(desc->bg_block_bitmap));
return NULL;
}
- if (likely(bh_uptodate_or_lock(bh)))
+ ret = bh_read(bh, 0);
+ if (ret > 0)
return bh;
-
- if (bh_submit_read(bh) < 0) {
+ if (ret < 0) {
brelse(bh);
ext2_error(sb, __func__,
"Cannot read block bitmap - "
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 7e70e0ba5a6c..6ed728aae9a5 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -525,8 +525,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
- if (!buffer_locked(first_bh))
- ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh);
+ bh_read_nowait(first_bh, REQ_META | REQ_PRIO);
dblock++;
extlen--;
@@ -534,9 +533,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
while (extlen) {
bh = gfs2_getbuf(gl, dblock, CREATE);
- if (!buffer_uptodate(bh) && !buffer_locked(bh))
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META |
- REQ_PRIO, 1, &bh);
+ bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO);
brelse(bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index f201eaf59d0d..1ed17226d9ed 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -745,12 +745,8 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
}
if (PageUptodate(page))
set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- goto unlock_out;
- }
+ if (bh_read(bh, REQ_META | REQ_PRIO) < 0)
+ goto unlock_out;
if (gfs2_is_jdata(ip))
gfs2_trans_add_data(ip->i_gl, bh);
else
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index b466172eec25..59b03d74ecbe 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -82,7 +82,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
return 0;
}
haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
- ll_rw_block(REQ_OP_READ, haveblocks, bhs);
+ bh_read_batch(haveblocks, bhs);
curbh = 0;
curpage = 0;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6350d3857c89..140b070471c0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1893,19 +1893,16 @@ static int journal_get_superblock(journal_t *journal)
{
struct buffer_head *bh;
journal_superblock_t *sb;
- int err = -EIO;
+ int err;
bh = journal->j_sb_buffer;
J_ASSERT(bh != NULL);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- printk(KERN_ERR
- "JBD2: IO error reading journal superblock\n");
- goto out;
- }
+ err = bh_read(bh, 0);
+ if (err < 0) {
+ printk(KERN_ERR
+ "JBD2: IO error reading journal superblock\n");
+ goto out;
}
if (buffer_verified(bh))
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index f548479615c6..1f878c315b03 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
- ll_rw_block(REQ_OP_READ, nbufs, bufs);
+ bh_readahead_batch(nbufs, bufs, 0);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
@@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
if (nbufs)
- ll_rw_block(REQ_OP_READ, nbufs, bufs);
+ bh_readahead_batch(nbufs, bufs, 0);
err = 0;
failed:
@@ -152,9 +152,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
return -ENOMEM;
if (!buffer_uptodate(bh)) {
- /* If this is a brand new buffer, start readahead.
- Otherwise, we assume we are already reading it. */
- if (!buffer_req(bh))
+ /*
+ * If this is a brand new buffer, start readahead.
+ * Otherwise, we assume we are already reading it.
+ */
+ bool need_readahead = !buffer_req(bh);
+
+ bh_read_nowait(bh, 0);
+ if (need_readahead)
do_readahead(journal, offset);
wait_on_buffer(bh);
}
@@ -687,7 +692,6 @@ static int do_one_pass(journal_t *journal,
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
- /* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3267e96c256c..39b7eea2642a 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -480,41 +480,36 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
sector_t *blkoff)
{
- unsigned int i;
+ unsigned int i, nr_folios;
pgoff_t index;
- unsigned int nblocks_in_page;
unsigned long length = 0;
- sector_t b;
- struct pagevec pvec;
- struct page *page;
+ struct folio_batch fbatch;
+ struct folio *folio;
if (inode->i_mapping->nrpages == 0)
return 0;
index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
- nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
repeat:
- pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
- pvec.pages);
- if (pvec.nr == 0)
+ nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX,
+ &fbatch);
+ if (nr_folios == 0)
return length;
- if (length > 0 && pvec.pages[0]->index > index)
- goto out;
-
- b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits);
i = 0;
do {
- page = pvec.pages[i];
+ folio = fbatch.folios[i];
- lock_page(page);
- if (page_has_buffers(page)) {
+ folio_lock(folio);
+ if (folio_buffers(folio)) {
struct buffer_head *bh, *head;
+ sector_t b;
- bh = head = page_buffers(page);
+ b = folio->index << (PAGE_SHIFT - inode->i_blkbits);
+ bh = head = folio_buffers(folio);
do {
if (b < start_blk)
continue;
@@ -529,21 +524,17 @@ repeat:
} else {
if (length > 0)
goto out_locked;
-
- b += nblocks_in_page;
}
- unlock_page(page);
+ folio_unlock(folio);
- } while (++i < pagevec_count(&pvec));
+ } while (++i < nr_folios);
- index = page->index + 1;
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
goto repeat;
out_locked:
- unlock_page(page);
-out:
- pagevec_release(&pvec);
+ folio_unlock(folio);
+ folio_batch_release(&fbatch);
return length;
}
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 51363d4e8636..cadbfa111539 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -630,12 +630,9 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
bh->b_size = block_size;
off = vbo & (PAGE_SIZE - 1);
set_bh_page(bh, page, off);
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
+ err = bh_read(bh, 0);
+ if (err < 0)
goto out;
- }
zero_user_segment(page, off + voff, off + block_size);
}
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index af4157f61927..1d65f6ef00ca 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -636,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
!buffer_new(bh) &&
ocfs2_should_read_blk(inode, page, block_start) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
*wait_bh++=bh;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e2cc9eec287c..26b4c2bfee49 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1764,9 +1764,7 @@ static int ocfs2_get_sector(struct super_block *sb,
if (!buffer_dirty(*bh))
clear_buffer_uptodate(*bh);
unlock_buffer(*bh);
- ll_rw_block(REQ_OP_READ, 1, bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
+ if (bh_read(*bh, 0) < 0) {
mlog_errno(-EIO);
brelse(*bh);
*bh = NULL;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4e0023643f8b..482f91577f8c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -864,7 +864,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- hugepage_vma_check(vma, vma->vm_flags, true, false));
+ hugepage_vma_check(vma, vma->vm_flags, true, false, true));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ba3525ccc27e..cb240eac5036 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -203,9 +203,9 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- unsigned long maxpages, lpages, nr, loop, ret;
+ unsigned long maxpages, lpages, nr_folios, loop, ret, nr_pages, pfn;
struct inode *inode = file_inode(file);
- struct page **pages = NULL, **ptr, *page;
+ struct folio_batch fbatch;
loff_t isize;
/* the mapping mustn't extend beyond the EOF */
@@ -221,31 +221,39 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
goto out;
/* gang-find the pages */
- pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- goto out_free;
-
- nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages);
- if (nr != lpages)
- goto out_free_pages; /* leave if some pages were missing */
+ folio_batch_init(&fbatch);
+ nr_pages = 0;
+repeat:
+ nr_folios = filemap_get_folios_contig(inode->i_mapping, &pgoff,
+ ULONG_MAX, &fbatch);
+ if (!nr_folios) {
+ ret = -ENOSYS;
+ return ret;
+ }
+ if (ret == -ENOSYS) {
+ ret = (unsigned long) folio_address(fbatch.folios[0]);
+ pfn = folio_pfn(fbatch.folios[0]);
+ }
/* check the pages for physical adjacency */
- ptr = pages;
- page = *ptr++;
- page++;
- for (loop = lpages; loop > 1; loop--)
- if (*ptr++ != page++)
- goto out_free_pages;
+ for (loop = 0; loop < nr_folios; loop++) {
+ if (pfn + nr_pages != folio_pfn(fbatch.folios[loop])) {
+ ret = -ENOSYS;
+ goto out_free; /* leave if not physical adjacent */
+ }
+ nr_pages += folio_nr_pages(fbatch.folios[loop]);
+ if (nr_pages >= lpages)
+ goto out_free; /* successfully found desired pages*/
+ }
+ if (nr_pages < lpages) {
+ folio_batch_release(&fbatch);
+ goto repeat; /* loop if pages are missing */
+ }
/* okay - all conditions fulfilled */
- ret = (unsigned long) page_address(pages[0]);
-out_free_pages:
- ptr = pages;
- for (loop = nr; loop > 0; loop--)
- put_page(*ptr++);
out_free:
- kfree(pages);
+ folio_batch_release(&fbatch);
out:
return ret;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 94addfcefede..9f62da7471c9 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -868,7 +868,7 @@ loop_next:
*/
if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
spin_unlock(lock);
- ll_rw_block(REQ_OP_WRITE, 1, &bh);
+ write_dirty_buffer(bh, 0);
spin_lock(lock);
}
put_bh(bh);
@@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s,
if (tbh) {
if (buffer_dirty(tbh)) {
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(REQ_OP_WRITE, 1, &tbh);
+ write_dirty_buffer(tbh, 0);
reiserfs_write_lock_nested(s, depth);
}
put_bh(tbh) ;
@@ -2240,7 +2240,7 @@ abort_replay:
}
}
/* read in the log blocks, memcpy to the corresponding real block */
- ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks);
+ bh_read_batch(get_desc_trans_len(desc), log_blocks);
for (i = 0; i < get_desc_trans_len(desc); i++) {
wait_on_buffer(log_blocks[i]);
@@ -2342,10 +2342,11 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
} else
bhlist[j++] = bh;
}
- ll_rw_block(REQ_OP_READ, j, bhlist);
+ bh = bhlist[0];
+ bh_read_nowait(bh, 0);
+ bh_readahead_batch(j - 1, &bhlist[1], 0);
for (i = 1; i < j; i++)
brelse(bhlist[i]);
- bh = bhlist[0];
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 9a293609a022..84c12a1947b2 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s,
if (!buffer_uptodate(bh[j])) {
if (depth == -1)
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j);
+ bh_readahead(bh[j], REQ_RAHEAD);
}
brelse(bh[j]);
}
@@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
if (!buffer_uptodate(bh) && depth == -1)
depth = reiserfs_write_unlock_nested(sb);
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
wait_on_buffer(bh);
if (depth != -1)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c88cd2ce0665..a5ffec0c7517 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1702,9 +1702,7 @@ static int read_super_block(struct super_block *s, int offset)
/* after journal replay, reread all bitmap and super blocks */
static int reread_meta_blocks(struct super_block *s)
{
- ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s));
- wait_on_buffer(SB_BUFFER_WITH_SB(s));
- if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
+ if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
return 1;
}
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index cad3772f9dbe..be640f4b2f2c 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
brelse(tmp);
}
if (num) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
+ bh_readahead_batch(num, bha, REQ_RAHEAD);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index a2adf6293093..16bcf2c6b8b3 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
brelse(tmp);
}
if (num) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
+ bh_readahead_batch(num, bha, REQ_RAHEAD);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8d06daed549f..dce6ae9ae306 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1211,13 +1211,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block,
if (!bh)
return NULL;
- if (buffer_uptodate(bh))
- return bh;
-
- ll_rw_block(REQ_OP_READ, 1, &bh);
-
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
+ if (bh_read(bh, 0) >= 0)
return bh;
brelse(bh);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index bd810d8239f2..2436e3f82147 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -295,14 +295,10 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
if (!buffer_mapped(bh))
map_bh(bh, inode->i_sb, oldb + pos);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- ufs_error(inode->i_sb, __func__,
- "read of block failed\n");
- break;
- }
+ if (bh_read(bh, 0) < 0) {
+ ufs_error(inode->i_sb, __func__,
+ "read of block failed\n");
+ break;
}
UFSD(" change from %llu to %llu, pos %u\n",
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 175de70e3adf..4de91ba9e85e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
+#include <linux/miscdevice.h>
int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
- if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
- ctx->flags & UFFD_USER_MODE_ONLY) {
- printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
- "sysctl knob to 1 if kernel faults must be handled "
- "without obtaining CAP_SYS_PTRACE capability\n");
+ if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
goto out;
- }
/*
* If it's already released don't get it. This avoids to loop
@@ -2056,20 +2052,11 @@ static void init_once_userfaultfd_ctx(void *mem)
seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}
-SYSCALL_DEFINE1(userfaultfd, int, flags)
+static int new_userfaultfd(int flags)
{
struct userfaultfd_ctx *ctx;
int fd;
- if (!sysctl_unprivileged_userfaultfd &&
- (flags & UFFD_USER_MODE_ONLY) == 0 &&
- !capable(CAP_SYS_PTRACE)) {
- printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
- "sysctl knob to 1 if kernel faults must be handled "
- "without obtaining CAP_SYS_PTRACE capability\n");
- return -EPERM;
- }
-
BUG_ON(!current->mm);
/* Check the UFFD_* constants for consistency. */
@@ -2102,8 +2089,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
return fd;
}
+static inline bool userfaultfd_syscall_allowed(int flags)
+{
+ /* Userspace-only page faults are always allowed */
+ if (flags & UFFD_USER_MODE_ONLY)
+ return true;
+
+ /*
+ * The user is requesting a userfaultfd which can handle kernel faults.
+ * Privileged users are always allowed to do this.
+ */
+ if (capable(CAP_SYS_PTRACE))
+ return true;
+
+ /* Otherwise, access to kernel fault handling is sysctl controlled. */
+ return sysctl_unprivileged_userfaultfd;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ if (!userfaultfd_syscall_allowed(flags))
+ return -EPERM;
+
+ return new_userfaultfd(flags);
+}
+
+static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
+{
+ if (cmd != USERFAULTFD_IOC_NEW)
+ return -EINVAL;
+
+ return new_userfaultfd(flags);
+}
+
+static const struct file_operations userfaultfd_dev_fops = {
+ .unlocked_ioctl = userfaultfd_dev_ioctl,
+ .compat_ioctl = userfaultfd_dev_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice userfaultfd_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "userfaultfd",
+ .fops = &userfaultfd_dev_fops
+};
+
static int __init userfaultfd_init(void)
{
+ int ret;
+
+ ret = misc_register(&userfaultfd_misc);
+ if (ret)
+ return ret;
+
userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
sizeof(struct userfaultfd_ctx),
0,
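The misc device provides the same functionality as the syscall, but access is governed by the permissions on /dev/userfaultfd instead of the sysctl or CAP_SYS_PTRACE. A minimal userspace sketch (assumes uapi headers that already define USERFAULTFD_IOC_NEW and a device node the caller may open; error handling kept short):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/userfaultfd.h>	/* USERFAULTFD_IOC_NEW */

int main(void)
{
	/* The open mode only needs to satisfy the node's permissions. */
	int dev = open("/dev/userfaultfd", O_RDONLY | O_CLOEXEC);
	if (dev < 0) {
		perror("open /dev/userfaultfd");
		return 1;
	}

	/* The ioctl argument carries the same flags userfaultfd(2) accepts. */
	int uffd = ioctl(dev, USERFAULTFD_IOC_NEW, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		perror("USERFAULTFD_IOC_NEW");
		return 1;
	}

	/* uffd is now a regular userfaultfd; continue with UFFDIO_API etc. */
	close(uffd);
	close(dev);
	return 0;
}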
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 089c9ade4325..9b6556d3f110 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -214,8 +214,6 @@ struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
-void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
- gfp_t gfp);
struct buffer_head *__bread_gfp(struct block_device *,
sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
@@ -225,7 +223,6 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
void __lock_buffer(struct buffer_head *bh);
-void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
@@ -233,7 +230,9 @@ int submit_bh(blk_opf_t, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
-int bh_submit_read(struct buffer_head *bh);
+int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
+void __bh_read_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags, bool force_lock);
extern int buffer_heads_over_limit;
@@ -340,12 +339,6 @@ sb_breadahead(struct super_block *sb, sector_t block)
__breadahead(sb->s_bdev, block, sb->s_blocksize);
}
-static inline void
-sb_breadahead_unmovable(struct super_block *sb, sector_t block)
-{
- __breadahead_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
-}
-
static inline struct buffer_head *
sb_getblk(struct super_block *sb, sector_t block)
{
@@ -407,6 +400,41 @@ static inline struct buffer_head *__getblk(struct block_device *bdev,
return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
}
+static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (!buffer_uptodate(bh) && trylock_buffer(bh)) {
+ if (!buffer_uptodate(bh))
+ __bh_read(bh, op_flags, false);
+ else
+ unlock_buffer(bh);
+ }
+}
+
+static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (!bh_uptodate_or_lock(bh))
+ __bh_read(bh, op_flags, false);
+}
+
+/* Returns 1 if the buffer is already uptodate, 0 on a successful read, and -EIO on error. */

+static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (bh_uptodate_or_lock(bh))
+ return 1;
+ return __bh_read(bh, op_flags, true);
+}
+
+static inline void bh_read_batch(int nr, struct buffer_head *bhs[])
+{
+ __bh_read_batch(nr, bhs, 0, true);
+}
+
+static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags)
+{
+ __bh_read_batch(nr, bhs, op_flags, false);
+}
+
/**
* __bread() - reads a specified block and returns the bh
* @bdev: the block_device to read from
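For reference, the open-coded pattern the converted ll_rw_block() callers used (see the udf and ufs hunks above) and its replacement look roughly like this:

	/* Old pattern: submit the read and wait by hand. */
	if (!buffer_uptodate(bh)) {
		ll_rw_block(REQ_OP_READ, 1, &bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			return -EIO;
	}

	/*
	 * New pattern: bh_read() locks the buffer itself and returns 1 if it
	 * was already uptodate, 0 after a successful read, or -EIO.
	 */
	if (bh_read(bh, 0) < 0)
		return -EIO;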
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f314be58fa77..ea6cb9399152 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -18,6 +18,9 @@ static inline int gfp_migratetype(const gfp_t gfp_flags)
VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
+ BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE);
+ BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >>
+ GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC);
if (unlikely(page_group_by_mobility_disabled))
return MIGRATE_UNMOVABLE;
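The new assertions only pin down the layout that gfp_migratetype() already depends on: shifting the two mobility bits down by GFP_MOVABLE_SHIFT must reproduce the migratetype enum, i.e. (numeric values shown for illustration, assuming the usual enum order):

	___GFP_MOVABLE                        >> GFP_MOVABLE_SHIFT == MIGRATE_MOVABLE     /* 1 */
	___GFP_RECLAIMABLE                    >> GFP_MOVABLE_SHIFT == MIGRATE_RECLAIMABLE /* 2 */
	(___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> GFP_MOVABLE_SHIFT == MIGRATE_HIGHATOMIC  /* 3 */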
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 768e5261fdae..38265f9f782e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
!inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf);
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+ bool smaps, bool in_pf, bool enforce_sysfs);
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
@@ -219,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
int advice);
+int madvise_collapse(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
unsigned long end, long adjust_next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -321,8 +323,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
}
static inline bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf)
+ unsigned long vm_flags, bool smaps,
+ bool in_pf, bool enforce_sysfs)
{
return false;
}
@@ -362,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma,
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
- BUG();
- return 0;
+ return -EINVAL;
}
+
+static inline int madvise_collapse(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ return -EINVAL;
+}
+
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3ec981a0d8b3..57e72954a482 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1123,14 +1123,10 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
-extern void __init hugetlb_cma_check(void);
#else
static inline __init void hugetlb_cma_reserve(int order)
{
}
-static inline __init void hugetlb_cma_check(void)
-{
-}
#endif
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 379344828e78..630cd255d0cf 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -90,32 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page)
return __hugetlb_cgroup_from_page(page, true);
}
-static inline int __set_hugetlb_cgroup(struct page *page,
+static inline void __set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg, bool rsvd)
{
VM_BUG_ON_PAGE(!PageHuge(page), page);
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
- return -1;
+ return;
if (rsvd)
set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD,
(unsigned long)h_cg);
else
set_page_private(page + SUBPAGE_INDEX_CGROUP,
(unsigned long)h_cg);
- return 0;
}
-static inline int set_hugetlb_cgroup(struct page *page,
+static inline void set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return __set_hugetlb_cgroup(page, h_cg, false);
+ __set_hugetlb_cgroup(page, h_cg, false);
}
-static inline int set_hugetlb_cgroup_rsvd(struct page *page,
+static inline void set_hugetlb_cgroup_rsvd(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return __set_hugetlb_cgroup(page, h_cg, true);
+ __set_hugetlb_cgroup(page, h_cg, true);
}
static inline bool hugetlb_cgroup_disabled(void)
@@ -199,16 +198,14 @@ hugetlb_cgroup_from_page_rsvd(struct page *page)
return NULL;
}
-static inline int set_hugetlb_cgroup(struct page *page,
+static inline void set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return 0;
}
-static inline int set_hugetlb_cgroup_rsvd(struct page *page,
+static inline void set_hugetlb_cgroup_rsvd(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return 0;
}
static inline bool hugetlb_cgroup_disabled(void)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6257867fbf95..a2461f9a8738 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -354,10 +354,11 @@ struct mem_cgroup {
};
/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
+ * size of first charge trial.
+ * TODO: maybe necessary to use big numbers on big irons, or make it
+ * dynamic based on the workload.
*/
-#define MEMCG_CHARGE_BATCH 32U
+#define MEMCG_CHARGE_BATCH 64U
extern struct mem_cgroup *root_mem_cgroup;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e0b2209ab71c..51052969dbfe 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -11,7 +11,6 @@ struct page;
struct zone;
struct pglist_data;
struct mem_section;
-struct memory_block;
struct memory_group;
struct resource;
struct vmem_altmap;
@@ -216,6 +215,22 @@ void put_online_mems(void);
void mem_hotplug_begin(void);
void mem_hotplug_done(void);
+/* See kswapd_is_running() */
+static inline void pgdat_kswapd_lock(pg_data_t *pgdat)
+{
+ mutex_lock(&pgdat->kswapd_lock);
+}
+
+static inline void pgdat_kswapd_unlock(pg_data_t *pgdat)
+{
+ mutex_unlock(&pgdat->kswapd_lock);
+}
+
+static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat)
+{
+ mutex_init(&pgdat->kswapd_lock);
+}
+
#else /* ! CONFIG_MEMORY_HOTPLUG */
#define pfn_to_online_page(pfn) \
({ \
@@ -252,6 +267,10 @@ static inline bool movable_node_is_enabled(void)
{
return false;
}
+
+static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {}
+static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
+static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
#endif /* ! CONFIG_MEMORY_HOTPLUG */
/*
@@ -333,7 +352,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
extern void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages);
-extern bool is_memblock_offlined(struct memory_block *mem);
extern int sparse_add_section(int nid, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 668389b4b53d..d232de7cdc56 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -151,13 +151,6 @@ extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask);
extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
-static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
-{
- struct mempolicy *mpol = get_task_policy(current);
-
- return policy_nodemask(gfp, mpol);
-}
-
extern unsigned int mempolicy_slab_node(void);
extern enum zone_type policy_zone;
@@ -189,6 +182,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
return (pol->mode == MPOL_PREFERRED_MANY);
}
+extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
#else
@@ -294,11 +288,6 @@ static inline void mpol_put_task_policy(struct task_struct *task)
{
}
-static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
-{
- return NULL;
-}
-
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
return false;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21f8b27bd9fd..8a5ad9d050bf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1255,6 +1255,18 @@ static inline int folio_nid(const struct folio *folio)
}
#ifdef CONFIG_NUMA_BALANCING
+/* page access time bits need to hold at least 4 seconds */
+#define PAGE_ACCESS_TIME_MIN_BITS 12
+#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
+#define PAGE_ACCESS_TIME_BUCKETS \
+ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
+#else
+#define PAGE_ACCESS_TIME_BUCKETS 0
+#endif
+
+#define PAGE_ACCESS_TIME_MASK \
+ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)
+
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
@@ -1318,12 +1330,25 @@ static inline void page_cpupid_reset_last(struct page *page)
page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
+
+static inline int xchg_page_access_time(struct page *page, int time)
+{
+ int last_time;
+
+ last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+ return last_time << PAGE_ACCESS_TIME_BUCKETS;
+}
#else /* !CONFIG_NUMA_BALANCING */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
return page_to_nid(page); /* XXX */
}
+static inline int xchg_page_access_time(struct page *page, int time)
+{
+ return 0;
+}
+
static inline int page_cpupid_last(struct page *page)
{
return page_to_nid(page); /* XXX */
@@ -2495,7 +2520,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn);
-extern unsigned long find_min_pfn_with_active_regions(void);
#ifndef CONFIG_NUMA
static inline int early_pfn_to_nid(unsigned long pfn)
@@ -2975,8 +2999,8 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
* PageAnonExclusive() has to protect against concurrent GUP:
* * Ordinary GUP: Using the PT lock
* * GUP-fast and fork(): mm->write_protect_seq
- * * GUP-fast and KSM or temporary unmapping (swap, migration):
- * clear/invalidate+flush of the page table entry
+ * * GUP-fast and KSM or temporary unmapping (swap, migration): see
+ * page_try_share_anon_rmap()
*
* Must be called with the (sub)page that's actually referenced via the
* page table entry, which might not necessarily be the head page for a
@@ -2997,6 +3021,11 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page)
*/
if (!PageAnon(page))
return false;
+
+ /* Paired with a memory barrier in page_try_share_anon_rmap(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_rmb();
+
/*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
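A worked example of the PAGE_ACCESS_TIME bucketing added above (LAST_CPUPID_SHIFT is config dependent; 8 is used here purely for illustration):

	/*
	 * With LAST_CPUPID_SHIFT == 8:
	 *   PAGE_ACCESS_TIME_BUCKETS = 12 - 8 = 4    -> time stored in 16 ms buckets
	 *   PAGE_ACCESS_TIME_MASK    = 0xff << 4 = 0xff0
	 *   representable range      = 256 * 16 ms ~= 4.1 s, i.e. the "at least
	 *                              4 seconds" required by the comment above
	 */

xchg_page_access_time() therefore stores jiffies_to_msecs(jiffies) with 16 ms granularity in this configuration and returns the previously stored value scaled back to milliseconds.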
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e24b40c52468..18cf0fc5ce67 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -24,10 +24,10 @@
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
-#ifndef CONFIG_FORCE_MAX_ZONEORDER
+#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
#define MAX_ORDER 11
#else
-#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
+#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
@@ -221,6 +221,7 @@ enum node_stat_item {
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
+ PGPROMOTE_CANDIDATE, /* candidate pages to promote */
#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -306,6 +307,8 @@ static inline bool is_active_lru(enum lru_list lru)
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
+#define WORKINGSET_ANON 0
+#define WORKINGSET_FILE 1
#define ANON_AND_FILE 2
enum lruvec_flags {
@@ -953,8 +956,10 @@ typedef struct pglist_data {
atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
unsigned long nr_reclaim_start; /* nr pages written while throttled
* when throttling started. */
- struct task_struct *kswapd; /* Protected by
- mem_hotplug_begin/done() */
+#ifdef CONFIG_MEMORY_HOTPLUG
+ struct mutex kswapd_lock;
+#endif
+ struct task_struct *kswapd; /* Protected by kswapd_lock */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
@@ -996,6 +1001,21 @@ typedef struct pglist_data {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ /* start time in ms of current promote rate limit period */
+ unsigned int nbp_rl_start;
+ /* number of promote candidate pages at start time of current rate limit period */
+ unsigned long nbp_rl_nr_cand;
+ /* promote threshold in ms */
+ unsigned int nbp_threshold;
+ /* start time in ms of current promote threshold adjustment period */
+ unsigned int nbp_th_start;
+ /*
+	 * number of promote candidate pages at start time of current promote
+ * threshold adjustment period
+ */
+ unsigned long nbp_th_nr_cand;
+#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 679591301994..78a1c934e416 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -3,15 +3,26 @@
#define _LINUX_PAGE_COUNTER_H
#include <linux/atomic.h>
+#include <linux/cache.h>
#include <linux/kernel.h>
#include <asm/page.h>
+#if defined(CONFIG_SMP)
+struct pc_padding {
+ char x[0];
+} ____cacheline_internodealigned_in_smp;
+#define PC_PADDING(name) struct pc_padding name
+#else
+#define PC_PADDING(name)
+#endif
+
struct page_counter {
+ /*
+	 * Make sure 'usage' does not share a cacheline with any other field. The
+ * memcg->memory.usage is a hot member of struct mem_cgroup.
+ */
atomic_long_t usage;
- unsigned long min;
- unsigned long low;
- unsigned long high;
- unsigned long max;
+ PC_PADDING(_pad1_);
/* effective memory.min and memory.min usage tracking */
unsigned long emin;
@@ -23,18 +34,18 @@ struct page_counter {
atomic_long_t low_usage;
atomic_long_t children_low_usage;
- /* legacy */
unsigned long watermark;
unsigned long failcnt;
- /*
- * 'parent' is placed here to be far from 'usage' to reduce
- * cache false sharing, as 'usage' is written mostly while
- * parent is frequently read for cgroup's hierarchical
- * counting nature.
- */
+	/* Keep all the read-mostly fields in a separate cacheline. */
+ PC_PADDING(_pad2_);
+
+ unsigned long min;
+ unsigned long low;
+ unsigned long high;
+ unsigned long max;
struct page_counter *parent;
-};
+} ____cacheline_internodealigned_in_smp;
#if BITS_PER_LONG == 32
#define PAGE_COUNTER_MAX LONG_MAX
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index fabb2e1e087f..22be4582faae 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -36,9 +36,15 @@ struct page_ext {
unsigned long flags;
};
+extern bool early_page_ext;
extern unsigned long page_ext_size;
extern void pgdat_page_ext_init(struct pglist_data *pgdat);
+static inline bool early_page_ext_enabled(void)
+{
+ return early_page_ext;
+}
+
#ifdef CONFIG_SPARSEMEM
static inline void page_ext_init_flatmem(void)
{
@@ -55,7 +61,8 @@ static inline void page_ext_init(void)
}
#endif
-struct page_ext *lookup_page_ext(const struct page *page);
+extern struct page_ext *page_ext_get(struct page *page);
+extern void page_ext_put(struct page_ext *page_ext);
static inline struct page_ext *page_ext_next(struct page_ext *curr)
{
@@ -67,13 +74,13 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr)
#else /* !CONFIG_PAGE_EXTENSION */
struct page_ext;
-static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
+static inline bool early_page_ext_enabled(void)
{
+ return false;
}
-static inline struct page_ext *lookup_page_ext(const struct page *page)
+static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
{
- return NULL;
}
static inline void page_ext_init(void)
@@ -87,5 +94,14 @@ static inline void page_ext_init_flatmem_late(void)
static inline void page_ext_init_flatmem(void)
{
}
+
+static inline struct page_ext *page_ext_get(struct page *page)
+{
+ return NULL;
+}
+
+static inline void page_ext_put(struct page_ext *page_ext)
+{
+}
#endif /* CONFIG_PAGE_EXTENSION */
#endif /* __LINUX_PAGE_EXT_H */
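page_ext_get()/page_ext_put() turn the old bare lookup into a get/put pair so that the page_ext storage can eventually be freed safely. Every caller follows the same pattern as the page_idle.h conversion below; a minimal sketch:

	struct page_ext *page_ext = page_ext_get(page);

	if (unlikely(!page_ext))
		return;			/* no page_ext for this page */
	/* ... read or update page_ext->flags ... */
	page_ext_put(page_ext);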
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 4663dfed1293..5cb7bd2078ec 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -13,65 +13,79 @@
* If there is not enough space to store Idle and Young bits in page flags, use
* page ext flags instead.
*/
-
static inline bool folio_test_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_young;
if (unlikely(!page_ext))
return false;
- return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_young;
}
static inline void folio_set_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
}
static inline bool folio_test_clear_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_young;
if (unlikely(!page_ext))
return false;
- return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_young;
}
static inline bool folio_test_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_idle;
if (unlikely(!page_ext))
return false;
- return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_idle;
}
static inline void folio_set_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
set_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
}
static inline void folio_clear_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
}
#endif /* !CONFIG_64BIT */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0178b2040ea3..09de43e36a64 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -718,8 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch);
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages);
+unsigned filemap_get_folios_contig(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages);
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index ac7b38ad5903..f3fafb731ffd 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -15,12 +15,12 @@ struct mm_walk;
* this handler is required to be able to handle
* pmd_trans_huge() pmds. They may simply choose to
* split_huge_page() instead of handling it explicitly.
- * @pte_entry: if set, called for each non-empty PTE (lowest-level)
- * entry
+ * @pte_entry: if set, called for each PTE (lowest-level) entry,
+ * including empty ones
* @pte_hole: if set, called for each hole at all levels,
- * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD
- * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal
- * to 1) are skipped.
+ * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
+ * Any folded depths (where PTRS_PER_P?D is equal to 1)
+ * are skipped.
* @hugetlb_entry: if set, called for each hugetlb entry
* @test_walk: caller specific callback function to determine whether
* we walk over the current vma or not. Returning 0 means
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 014ee8f0fbaa..d13b4f7cc5be 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1276,8 +1276,7 @@ static inline int pgd_devmap(pgd_t pgd)
#endif
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
- (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
- !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline int pud_trans_huge(pud_t pud)
{
return 0;
@@ -1598,11 +1597,7 @@ typedef unsigned int pgtbl_mod_mask;
#endif
#ifndef has_transparent_hugepage
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define has_transparent_hugepage() 1
-#else
-#define has_transparent_hugepage() 0
-#endif
+#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif
/*
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf80adca980b..72b2bcc37f73 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -267,7 +267,7 @@ dup:
* @page: the exclusive anonymous page to try marking possibly shared
*
* The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated+flushed, to properly sync against GUP-fast.
+ * cleared/invalidated.
*
* This is similar to page_try_dup_anon_rmap(), however, not used during fork()
* to duplicate a mapping, but instead to prepare for KSM or temporarily
@@ -283,12 +283,68 @@ static inline int page_try_share_anon_rmap(struct page *page)
{
VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
- /* See page_try_dup_anon_rmap(). */
- if (likely(!is_device_private_page(page) &&
- unlikely(page_maybe_dma_pinned(page))))
- return -EBUSY;
+ /* device private pages cannot get pinned via GUP. */
+ if (unlikely(is_device_private_page(page))) {
+ ClearPageAnonExclusive(page);
+ return 0;
+ }
+ /*
+ * We have to make sure that when we clear PageAnonExclusive, that
+ * the page is not pinned and that concurrent GUP-fast won't succeed in
+ * concurrently pinning the page.
+ *
+ * Conceptually, PageAnonExclusive clearing consists of:
+ * (A1) Clear PTE
+ * (A2) Check if the page is pinned; back off if so.
+ * (A3) Clear PageAnonExclusive
+ * (A4) Restore PTE (optional, but certainly not writable)
+ *
+ * When clearing PageAnonExclusive, we cannot possibly map the page
+ * writable again, because anon pages that may be shared must never
+ * be writable. So in any case, if the PTE was writable it cannot
+ * be writable anymore afterwards and there would be a PTE change. Only
+ * if the PTE wasn't writable, there might not be a PTE change.
+ *
+ * Conceptually, GUP-fast pinning of an anon page consists of:
+ * (B1) Read the PTE
+ * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
+ * (B3) Pin the mapped page
+ * (B4) Check if the PTE changed by re-reading it; back off if so.
+ * (B5) If the original PTE is not writable, check if
+ * PageAnonExclusive is not set; back off if so.
+ *
+ * If the PTE was writable, we only have to make sure that GUP-fast
+ * observes a PTE change and properly backs off.
+ *
+ * If the PTE was not writable, we have to make sure that GUP-fast either
+ * detects a (temporary) PTE change or that PageAnonExclusive is cleared
+ * and properly backs off.
+ *
+ * Consequently, when clearing PageAnonExclusive(), we have to make
+ * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
+ * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
+ * and (B5) happen in the right memory order.
+ *
+ * We assume that there might not be a memory barrier after
+ * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
+ * so we use explicit ones here.
+ */
+
+ /* Paired with the memory barrier in try_grab_folio(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb();
+
+ if (unlikely(page_maybe_dma_pinned(page)))
+ return -EBUSY;
ClearPageAnonExclusive(page);
+
+ /*
+ * This is conceptually a smp_wmb() paired with the smp_rmb() in
+ * gup_must_unshare().
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb__after_atomic();
return 0;
}
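Read together with the gup_must_unshare() and try_grab_folio() hunks elsewhere in this patch, the barrier pairing can be summarized roughly as follows (a sketch, not a substitute for the comment above):

	/*
	 * Unmap/KSM side, page_try_share_anon_rmap():
	 *   PTE cleared/invalidated by the caller
	 *   smp_mb()                  [pairs with smp_mb__after_atomic() in try_grab_folio()]
	 *   page_maybe_dma_pinned()?  back off with -EBUSY if so
	 *   ClearPageAnonExclusive()
	 *   smp_mb__after_atomic()    [pairs with smp_rmb() in gup_must_unshare()]
	 *
	 * GUP-fast side, try_grab_folio() / gup_must_unshare():
	 *   read the PTE
	 *   pin the page (atomic refcount update)
	 *   smp_mb__after_atomic()
	 *   re-read the PTE; back off if it changed
	 *   smp_rmb(); back off if PageAnonExclusive() is no longer set
	 *              (only reached for a non-writable PTE)
	 */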
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index e650946816d0..303ee7dd0c7e 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -27,6 +27,7 @@ enum sched_tunable_scaling {
#ifdef CONFIG_NUMA_BALANCING
extern int sysctl_numa_balancing_mode;
+extern unsigned int sysctl_numa_balancing_promote_rate_limit;
#else
#define sysctl_numa_balancing_mode 0
#endif
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0fefdf528e0d..352e3f082acc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -119,6 +119,12 @@
*/
#define SLAB_NO_USER_FLAGS ((slab_flags_t __force)0x10000000U)
+#ifdef CONFIG_KFENCE
+#define SLAB_SKIP_KFENCE ((slab_flags_t __force)0x20000000U)
+#else
+#define SLAB_SKIP_KFENCE 0
+#endif
+
/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index d651f3437367..55392bf30a03 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -11,6 +11,7 @@
EM( SCAN_FAIL, "failed") \
EM( SCAN_SUCCEED, "succeeded") \
EM( SCAN_PMD_NULL, "pmd_null") \
+ EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \
EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6c1aa92a92e4..6ce1f1ceb432 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -77,6 +77,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
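MADV_COLLAPSE requests a synchronous, best-effort collapse of the given range into huge pages. A hedged userspace sketch (the fallback define is only for building against older headers; the running kernel still has to support the advice, and madvise() may fail if the range cannot be collapsed):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25		/* value added by this series */
#endif

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MiB: covers at least one PMD-aligned 2 MiB stretch on x86-64 */
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (addr == MAP_FAILED)
		return 1;

	memset(addr, 1, len);		/* fault the range in first */

	/* Best effort: errors such as EINVAL or EAGAIN mean the range was not collapsed. */
	if (madvise(addr, len, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");

	munmap(addr, len);
	return 0;
}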
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 7d32b1e797fb..005e5e306266 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -12,6 +12,10 @@
#include <linux/types.h>
+/* ioctls for /dev/userfaultfd */
+#define USERFAULTFD_IOC 0xAA
+#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00)
+
/*
* If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
* UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In
diff --git a/init/main.c b/init/main.c
index 1fe7942f5d4a..2a475d40f952 100644
--- a/init/main.c
+++ b/init/main.c
@@ -849,6 +849,9 @@ static void __init mm_init(void)
pgtable_init();
debug_objects_mem_init();
vmalloc_init();
+ /* Should be run after vmap initialization */
+ if (early_page_ext_enabled())
+ page_ext_init();
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
/* Should be run after espfix64 is set up. */
@@ -1618,7 +1621,8 @@ static noinline void __init kernel_init_freeable(void)
padata_init();
page_alloc_init_late();
/* Initialize page ext after all struct pages are initialized. */
- page_ext_init();
+ if (!early_page_ext_enabled())
+ page_ext_init();
do_basic_setup();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee28253c9ac0..8fccd8721bb8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4396,6 +4396,17 @@ void set_numabalancing_state(bool enabled)
}
#ifdef CONFIG_PROC_SYSCTL
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -4412,6 +4423,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
if (err < 0)
return err;
if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
sysctl_numa_balancing_mode = state;
__set_numabalancing_state(state);
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index bb3d63bdf4ae..ad63dbfc54f1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -333,6 +333,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+ debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 914096c5b1ae..d642e9ff2829 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1094,6 +1094,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/* A page with hint page fault latency (in ms) below this threshold is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
struct numa_group {
refcount_t refcount;
@@ -1436,6 +1442,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
+/*
+ * If memory tiering mode is enabled, cpupid of slow memory page is
+ * used to record scan time instead of CPU and PID. When tiering mode
+ * is disabled at run time, the scan time (in cpupid) will be
+ * interpreted as CPU and PID. So the CPU needs to be checked to avoid
+ * out-of-bounds array access.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * the watermark defined here) in the fast memory node, all recently
+ * accessed slow memory pages will be migrated to the fast memory node
+ * without considering the hot threshold, to take full advantage of the
+ * fast memory capacity.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time will be recorded in struct page in addition to make page
+ * PROT_NONE for slow memory page. So when the page is accessed, in
+ * hint page fault handler, the hint page fault latency is calculated
+ * via,
+ *
+ * hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct page *page)
+{
+ int last_time, time;
+
+ time = jiffies_to_msecs(jiffies);
+ last_time = xchg_page_access_time(page, time);
+
+ return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high a promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit the
+ * number of pages that we try to promote.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1443,9 +1563,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * The pages in slow memory node should be migrated according
+ * to hot/cold instead of private/shared.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !node_is_toptier(src_nid)) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ latency = numa_hint_fault_latency(page);
+ if (latency >= th)
+ return false;
+
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ thp_nr_pages(page));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+ return false;
+
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
@@ -2685,6 +2840,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory
+ * node for memory tiering mode.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !cpupid_valid(last_cpupid)))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
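To make the defaults above concrete, a worked example assuming 4 KB pages (PAGE_SHIFT == 12), for illustration only:

	/*
	 *   rate_limit = 65536 << (20 - 12) = 16,777,216 pages/s  (~64 GiB/s per target node)
	 *   unit_th    = 1000 ms * 2 / 16   = 125 ms
	 *
	 * numa_promotion_adjust_threshold() lowers the hot threshold by one
	 * 125 ms step (not below 125 ms) when the candidate count in the last
	 * adjustment period exceeded 110% of the rate limit, and raises it by
	 * one step (not above 2000 ms) when it fell below 90%.
	 */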
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e26688d387ae..8e914e85ba8e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2452,6 +2452,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
#ifdef CONFIG_SCHED_HRTICK
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 205d605cacc5..f10a610aa834 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1641,6 +1641,14 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_FOUR,
},
+ {
+ .procname = "numa_balancing_promote_rate_limit_MBps",
+ .data = &sysctl_numa_balancing_promote_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
#endif /* CONFIG_NUMA_BALANCING */
{
.procname = "panic",
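A minimal userspace sketch for tuning the new knob (the path follows from the procname above; the value is in MB/s and writing it normally requires root):

#include <stdio.h>

int main(void)
{
	/* Cap NUMA promotion traffic at roughly 1 GiB/s per target node. */
	FILE *f = fopen("/proc/sys/kernel/numa_balancing_promote_rate_limit_MBps", "w");

	if (!f) {
		perror("numa_balancing_promote_rate_limit_MBps");
		return 1;
	}
	fprintf(f, "1024\n");
	return fclose(f) ? 1 : 0;
}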
diff --git a/mm/Kconfig b/mm/Kconfig
index 0331f1461f81..e3fbd0788878 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -23,7 +23,7 @@ menuconfig SWAP
in your computer. If unsure say Y.
config ZSWAP
- bool "Compressed cache for swap pages (EXPERIMENTAL)"
+ bool "Compressed cache for swap pages"
depends on SWAP
select FRONTSWAP
select CRYPTO
@@ -36,12 +36,6 @@ config ZSWAP
in the case where decompressing from RAM is faster than swap device
reads, can also improve workload performance.
- This is marked experimental because it is a new feature (as of
- v3.11) that interacts heavily with memory reclaim. While these
- interactions don't cause any known issues on simple memory setups,
- they have not be fully explored on the large set of potential
- configurations and workloads that exist.
-
config ZSWAP_DEFAULT_ON
bool "Enable the compressed cache for swap pages by default"
depends on ZSWAP
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index de65cb1e5f76..c30419a5e119 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -776,8 +776,6 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
int bdi_init(struct backing_dev_info *bdi)
{
- int ret;
-
bdi->dev = NULL;
kref_init(&bdi->refcnt);
@@ -788,9 +786,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
- ret = cgwb_bdi_init(bdi);
-
- return ret;
+ return cgwb_bdi_init(bdi);
}
struct backing_dev_info *bdi_alloc(int node_id)
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index c3ffe253e055..602fff89b15f 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -163,11 +163,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
- char name[CMA_MAX_NAME];
- scnprintf(name, sizeof(name), "cma-%s", cma->name);
-
- tmp = debugfs_create_dir(name, root_dentry);
+ tmp = debugfs_create_dir(cma->name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
diff --git a/mm/compaction.c b/mm/compaction.c
index 640fa76228dd..262c4676b32c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1981,9 +1981,21 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
+/*
+ * Determine whether kswapd is (or recently was!) running on this node.
+ *
+ * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
+ * zero it.
+ */
static bool kswapd_is_running(pg_data_t *pgdat)
{
- return pgdat->kswapd && task_is_running(pgdat->kswapd);
+ bool running;
+
+ pgdat_kswapd_lock(pgdat);
+ running = pgdat->kswapd && task_is_running(pgdat->kswapd);
+ pgdat_kswapd_unlock(pgdat);
+
+ return running;
}
/*
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index 573669566f84..45db79d28fdc 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -126,7 +126,7 @@ static void damon_test_split_at(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 100);
damon_add_region(r, t);
- damon_split_region_at(c, t, r, 25);
+ damon_split_region_at(t, r, 25);
KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
KUNIT_EXPECT_EQ(test, r->ar.end, 25ul);
@@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
- damon_split_regions_of(c, t, 2);
+ damon_split_regions_of(t, 2);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
- damon_split_regions_of(c, t, 4);
+ damon_split_regions_of(t, 4);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
damon_destroy_ctx(c);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 7d25dc582fe3..9964b9d00768 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -658,9 +658,8 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
}
}
-static void damon_split_region_at(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- unsigned long sz_r);
+static void damon_split_region_at(struct damon_target *t,
+ struct damon_region *r, unsigned long sz_r);
static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
@@ -726,7 +725,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
continue;
sz = DAMON_MIN_REGION;
}
- damon_split_region_at(c, t, r, sz);
+ damon_split_region_at(t, r, sz);
r = damon_next_region(r);
sz = r->ar.end - r->ar.start;
}
@@ -745,7 +744,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
DAMON_MIN_REGION);
if (!sz)
goto update_stat;
- damon_split_region_at(c, t, r, sz);
+ damon_split_region_at(t, r, sz);
}
ktime_get_coarse_ts64(&begin);
sz_applied = c->ops.apply_scheme(c, t, r, s);
@@ -928,9 +927,8 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
* r the region to be split
* sz_r size of the first sub-region that will be made
*/
-static void damon_split_region_at(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- unsigned long sz_r)
+static void damon_split_region_at(struct damon_target *t,
+ struct damon_region *r, unsigned long sz_r)
{
struct damon_region *new;
@@ -947,8 +945,7 @@ static void damon_split_region_at(struct damon_ctx *ctx,
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_ctx *ctx,
- struct damon_target *t, int nr_subs)
+static void damon_split_regions_of(struct damon_target *t, int nr_subs)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
@@ -969,7 +966,7 @@ static void damon_split_regions_of(struct damon_ctx *ctx,
if (sz_sub == 0 || sz_sub >= sz_region)
continue;
- damon_split_region_at(ctx, t, r, sz_sub);
+ damon_split_region_at(t, r, sz_sub);
sz_region = sz_sub;
}
}
@@ -1004,7 +1001,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(ctx, t, nr_subregions);
+ damon_split_regions_of(t, nr_subregions);
last_nr_regions = nr_regions;
}
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 4e51466c4e74..652a94deafe3 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -1053,7 +1053,7 @@ static int __init __damon_dbgfs_init(void)
fops[i]);
dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
- dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL);
+ dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL);
if (!dbgfs_dirs) {
debugfs_remove(dbgfs_root);
return -ENOMEM;
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index b1335de200e7..f599838b5f64 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -88,7 +88,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
#define DAMON_MAX_SUBSCORE (100)
#define DAMON_MAX_AGE_IN_LOG (32)
-int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s)
{
unsigned int max_nr_accesses;
@@ -127,48 +127,14 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
*/
hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
- /* Return coldness of the region */
- return DAMOS_MAX_SCORE - hotness;
+ return hotness;
}
-int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s)
{
- unsigned int max_nr_accesses;
- int freq_subscore;
- unsigned int age_in_sec;
- int age_in_log, age_subscore;
- unsigned int freq_weight = s->quota.weight_nr_accesses;
- unsigned int age_weight = s->quota.weight_age;
- int hotness;
-
- max_nr_accesses = c->aggr_interval / c->sample_interval;
- freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+ int hotness = damon_hot_score(c, r, s);
- age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
- for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
- age_in_log++, age_in_sec >>= 1)
- ;
-
- /* If frequency is 0, higher age means it's colder */
- if (freq_subscore == 0)
- age_in_log *= -1;
-
- /*
- * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
- * Scale it to be in [0, 100] and set it as age subscore.
- */
- age_in_log += DAMON_MAX_AGE_IN_LOG;
- age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
- DAMON_MAX_AGE_IN_LOG / 2;
-
- hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
- if (freq_weight + age_weight)
- hotness /= freq_weight + age_weight;
- /*
- * Transform it to fit in [0, DAMOS_MAX_SCORE]
- */
- hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
-
- return hotness;
+ /* Return coldness of the region */
+ return DAMOS_MAX_SCORE - hotness;
}
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index dc131c6a5403..6b0d9e6aa677 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -166,8 +166,7 @@ out:
return result.accessed;
}
-static void __damon_pa_check_access(struct damon_ctx *ctx,
- struct damon_region *r)
+static void __damon_pa_check_access(struct damon_region *r)
{
static unsigned long last_addr;
static unsigned long last_page_sz = PAGE_SIZE;
@@ -196,7 +195,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t) {
- __damon_pa_check_access(ctx, r);
+ __damon_pa_check_access(r);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
}
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 3c7b9d6dca95..a8505ad47c60 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -302,9 +302,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd)) {
ptl = pmd_lock(walk->mm, pmd);
- if (pmd_huge(*pmd)) {
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_huge(*pmd)) {
damon_pmdp_mkold(pmd, walk->mm, addr);
spin_unlock(ptl);
return 0;
@@ -429,9 +434,14 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd)) {
ptl = pmd_lock(walk->mm, pmd);
- if (!pmd_huge(*pmd)) {
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (!pmd_trans_huge(*pmd)) {
spin_unlock(ptl);
goto regular_page;
}
@@ -532,16 +542,15 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
* mm 'mm_struct' for the given virtual address space
* r the region to be checked
*/
-static void __damon_va_check_access(struct damon_ctx *ctx,
- struct mm_struct *mm, struct damon_region *r)
+static void __damon_va_check_access(struct mm_struct *mm,
+ struct damon_region *r, bool same_target)
{
- static struct mm_struct *last_mm;
static unsigned long last_addr;
static unsigned long last_page_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) ==
+ if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) ==
ALIGN_DOWN(r->sampling_addr, last_page_sz))) {
if (last_accessed)
r->nr_accesses++;
@@ -552,7 +561,6 @@ static void __damon_va_check_access(struct damon_ctx *ctx,
if (last_accessed)
r->nr_accesses++;
- last_mm = mm;
last_addr = r->sampling_addr;
}
@@ -562,14 +570,17 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
struct mm_struct *mm;
struct damon_region *r;
unsigned int max_nr_accesses = 0;
+ bool same_target;
damon_for_each_target(t, ctx) {
mm = damon_get_mm(t);
if (!mm)
continue;
+ same_target = false;
damon_for_each_region(r, t) {
- __damon_va_check_access(ctx, mm, r);
+ __damon_va_check_access(mm, r, same_target);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
+ same_target = true;
}
mmput(mm);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 15800334147b..8151890e9a00 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1633,24 +1633,26 @@ EXPORT_SYMBOL(folio_end_writeback);
*/
void page_endio(struct page *page, bool is_write, int err)
{
+ struct folio *folio = page_folio(page);
+
if (!is_write) {
if (!err) {
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
} else {
- ClearPageUptodate(page);
- SetPageError(page);
+ folio_clear_uptodate(folio);
+ folio_set_error(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
} else {
if (err) {
struct address_space *mapping;
- SetPageError(page);
- mapping = page_mapping(page);
+ folio_set_error(folio);
+ mapping = folio_mapping(folio);
if (mapping)
mapping_set_error(mapping, err);
}
- end_page_writeback(page);
+ folio_end_writeback(folio);
}
}
EXPORT_SYMBOL_GPL(page_endio);
@@ -2195,30 +2197,31 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
}
/**
- * find_get_pages_contig - gang contiguous pagecache lookup
+ * filemap_get_folios_contig - Get a batch of contiguous folios
* @mapping: The address_space to search
- * @index: The starting page index
- * @nr_pages: The maximum number of pages
- * @pages: Where the resulting pages are placed
+ * @start: The starting page index
+ * @end: The final page index (inclusive)
+ * @fbatch: The batch to fill
*
- * find_get_pages_contig() works exactly like find_get_pages_range(),
- * except that the returned number of pages are guaranteed to be
- * contiguous.
+ * filemap_get_folios_contig() works exactly like filemap_get_folios(),
+ * except the returned folios are guaranteed to be contiguous. This may
+ * not return all contiguous folios if the batch gets filled up.
*
- * Return: the number of pages which were found.
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
*/
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
- unsigned int nr_pages, struct page **pages)
+
+unsigned filemap_get_folios_contig(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
{
- XA_STATE(xas, &mapping->i_pages, index);
+ XA_STATE(xas, &mapping->i_pages, *start);
+ unsigned long nr;
struct folio *folio;
- unsigned int ret = 0;
-
- if (unlikely(!nr_pages))
- return 0;
rcu_read_lock();
- for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
+
+ for (folio = xas_load(&xas); folio && xas.xa_index <= end;
+ folio = xas_next(&xas)) {
if (xas_retry(&xas, folio))
continue;
/*
@@ -2226,33 +2229,45 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
* No current caller is looking for DAX entries.
*/
if (xa_is_value(folio))
- break;
+ goto update_start;
if (!folio_try_get_rcu(folio))
goto retry;
if (unlikely(folio != xas_reload(&xas)))
- goto put_page;
+ goto put_folio;
-again:
- pages[ret] = folio_file_page(folio, xas.xa_index);
- if (++ret == nr_pages)
- break;
- if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) {
- xas.xa_index++;
- folio_ref_inc(folio);
- goto again;
+ if (!folio_batch_add(fbatch, folio)) {
+ nr = folio_nr_pages(folio);
+
+ if (folio_test_hugetlb(folio))
+ nr = 1;
+ *start = folio->index + nr;
+ goto out;
}
continue;
-put_page:
+put_folio:
folio_put(folio);
+
retry:
xas_reset(&xas);
}
+
+update_start:
+ nr = folio_batch_count(fbatch);
+
+ if (nr) {
+ folio = fbatch->folios[nr - 1];
+ if (folio_test_hugetlb(folio))
+ *start = folio->index + 1;
+ else
+ *start = folio->index + folio_nr_pages(folio);
+ }
+out:
rcu_read_unlock();
- return ret;
+ return folio_batch_count(fbatch);
}
-EXPORT_SYMBOL(find_get_pages_contig);
+EXPORT_SYMBOL(filemap_get_folios_contig);
/**
* find_get_pages_range_tag - Find and return head pages matching @tag.
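A rough kernel-side sketch of how the new batch interface is consumed (mapping, first_index and last_index are assumed to be in scope; the pattern mirrors the converted callers):

	struct folio_batch fbatch;
	pgoff_t start = first_index;
	unsigned int i, nr;

	folio_batch_init(&fbatch);
	while ((nr = filemap_get_folios_contig(mapping, &start, last_index,
					       &fbatch))) {
		for (i = 0; i < nr; i++) {
			struct folio *folio = fbatch.folios[i];

			/* ... operate on each contiguous folio ... */
		}
		folio_batch_release(&fbatch);	/* drop the references taken above */
		cond_resched();
	}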
diff --git a/mm/gup.c b/mm/gup.c
index 00926abb4426..d4f706dc245f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -158,6 +158,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
else
folio_ref_add(folio,
refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in page_try_share_anon_rmap().
+ */
+ smp_mb__after_atomic();
+
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
return folio;
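As a reading aid, the pairing the new comment refers to looks roughly like this (a simplified sketch, not code from the patch; the rmap-side details live in page_try_share_anon_rmap()):

/*
 *   GUP-fast: try_grab_folio()           rmap: page_try_share_anon_rmap()
 *   --------------------------           --------------------------------
 *   bump the pin refcount                PTE/PMD invalidated first
 *   smp_mb__after_atomic()      <---->   paired memory barrier
 *   re-check that the PTE/PMD            check for GUP pins before
 *   is unchanged; if it changed,         clearing PageAnonExclusive;
 *   drop the pin and retry               back off if a pin is seen
 */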
@@ -1927,20 +1934,16 @@ struct page *get_dump_page(unsigned long addr)
#ifdef CONFIG_MIGRATION
/*
- * Check whether all pages are pinnable, if so return number of pages. If some
- * pages are not pinnable, migrate them, and unpin all pages. Return zero if
- * pages were migrated, or if some pages were not successfully isolated.
- * Return negative error if migration fails.
+ * Returns the number of collected pages. Return value is always >= 0.
*/
-static long check_and_migrate_movable_pages(unsigned long nr_pages,
- struct page **pages,
- unsigned int gup_flags)
+static unsigned long collect_longterm_unpinnable_pages(
+ struct list_head *movable_page_list,
+ unsigned long nr_pages,
+ struct page **pages)
{
- unsigned long isolation_error_count = 0, i;
+ unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
- LIST_HEAD(movable_page_list);
- bool drain_allow = true, coherent_pages = false;
- int ret = 0;
+ bool drain_allow = true;
for (i = 0; i < nr_pages; i++) {
struct folio *folio = page_folio(pages[i]);
@@ -1949,45 +1952,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
continue;
prev_folio = folio;
- /*
- * Device coherent pages are managed by a driver and should not
- * be pinned indefinitely as it prevents the driver moving the
- * page. So when trying to pin with FOLL_LONGTERM instead try
- * to migrate the page out of device memory.
- */
- if (folio_is_device_coherent(folio)) {
- /*
- * We always want a new GUP lookup with device coherent
- * pages.
- */
- pages[i] = 0;
- coherent_pages = true;
-
- /*
- * Migration will fail if the page is pinned, so convert
- * the pin on the source page to a normal reference.
- */
- if (gup_flags & FOLL_PIN) {
- get_page(&folio->page);
- unpin_user_page(&folio->page);
- }
+ if (folio_is_longterm_pinnable(folio))
+ continue;
- ret = migrate_device_coherent_page(&folio->page);
- if (ret)
- goto unpin_pages;
+ collected++;
+ if (folio_is_device_coherent(folio))
continue;
- }
- if (folio_is_longterm_pinnable(folio))
- continue;
- /*
- * Try to move out any movable page before pinning the range.
- */
if (folio_test_hugetlb(folio)) {
- if (isolate_hugetlb(&folio->page,
- &movable_page_list))
- isolation_error_count++;
+ isolate_hugetlb(&folio->page, movable_page_list);
continue;
}
@@ -1996,63 +1970,124 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
drain_allow = false;
}
- if (folio_isolate_lru(folio)) {
- isolation_error_count++;
+ if (!folio_isolate_lru(folio))
continue;
- }
- list_add_tail(&folio->lru, &movable_page_list);
+
+ list_add_tail(&folio->lru, movable_page_list);
node_stat_mod_folio(folio,
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
- if (!list_empty(&movable_page_list) || isolation_error_count ||
- coherent_pages)
- goto unpin_pages;
+ return collected;
+}
- /*
- * If list is empty, and no isolation errors, means that all pages are
- * in the correct zone.
- */
- return nr_pages;
+/*
+ * Unpins all pages and migrates device coherent pages and movable_page_list.
+ * Returns -EAGAIN if all pages were successfully migrated or -errno for failure
+ * (or partial success).
+ */
+static int migrate_longterm_unpinnable_pages(
+ struct list_head *movable_page_list,
+ unsigned long nr_pages,
+ struct page **pages)
+{
+ int ret;
+ unsigned long i;
-unpin_pages:
- /*
- * pages[i] might be NULL if any device coherent pages were found.
- */
for (i = 0; i < nr_pages; i++) {
- if (!pages[i])
+ struct folio *folio = page_folio(pages[i]);
+
+ if (folio_is_device_coherent(folio)) {
+ /*
+ * Migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
+ */
+ pages[i] = NULL;
+ folio_get(folio);
+ gup_put_folio(folio, 1, FOLL_PIN);
+
+ if (migrate_device_coherent_page(&folio->page)) {
+ ret = -EBUSY;
+ goto err;
+ }
+
continue;
+ }
- if (gup_flags & FOLL_PIN)
- unpin_user_page(pages[i]);
- else
- put_page(pages[i]);
+ /*
+ * We can't migrate pages with unexpected references, so drop
+ * the reference obtained by __get_user_pages_locked().
+ * Migrating pages have been added to movable_page_list after
+ * calling folio_isolate_lru() which takes a reference so the
+ * page won't be freed if it's migrating.
+ */
+ unpin_user_page(pages[i]);
+ pages[i] = NULL;
}
- if (!list_empty(&movable_page_list)) {
+ if (!list_empty(movable_page_list)) {
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_NOWARN,
};
- ret = migrate_pages(&movable_page_list, alloc_migration_target,
- NULL, (unsigned long)&mtc, MIGRATE_SYNC,
- MR_LONGTERM_PIN, NULL);
- if (ret > 0) /* number of pages not migrated */
+ if (migrate_pages(movable_page_list, alloc_migration_target,
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+ MR_LONGTERM_PIN, NULL)) {
ret = -ENOMEM;
+ goto err;
+ }
}
- if (ret && !list_empty(&movable_page_list))
- putback_movable_pages(&movable_page_list);
+ putback_movable_pages(movable_page_list);
+
+ return -EAGAIN;
+
+err:
+ for (i = 0; i < nr_pages; i++)
+ if (pages[i])
+ unpin_user_page(pages[i]);
+ putback_movable_pages(movable_page_list);
+
return ret;
}
+
+/*
+ * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
+ * pages in the range are required to be pinned via FOLL_PIN, before calling
+ * this routine.
+ *
+ * If any pages in the range are not allowed to be pinned, then this routine
+ * will migrate those pages away, unpin all the pages in the range and return
+ * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
+ * call this routine again.
+ *
+ * If an error other than -EAGAIN occurs, this indicates a migration failure.
+ * The caller should give up, and propagate the error back up the call stack.
+ *
+ * If everything is OK and all pages in the range are allowed to be pinned, then
+ * this routine leaves all pages pinned and returns zero for success.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages)
+{
+ unsigned long collected;
+ LIST_HEAD(movable_page_list);
+
+ collected = collect_longterm_unpinnable_pages(&movable_page_list,
+ nr_pages, pages);
+ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
+ pages);
+}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
- struct page **pages,
- unsigned int gup_flags)
+ struct page **pages)
{
- return nr_pages;
+ return 0;
}
#endif /* CONFIG_MIGRATION */
@@ -2068,22 +2103,36 @@ static long __gup_longterm_locked(struct mm_struct *mm,
unsigned int gup_flags)
{
unsigned int flags;
- long rc;
+ long rc, nr_pinned_pages;
if (!(gup_flags & FOLL_LONGTERM))
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
NULL, gup_flags);
+
+ /*
+ * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM
+ * implies FOLL_PIN (although the reverse is not true). Therefore it is
+ * correct to unconditionally call check_and_migrate_movable_pages()
+ * which assumes pages have been pinned via FOLL_PIN.
+ *
+ * Enforce the above reasoning by asserting that FOLL_PIN is set.
+ */
+ if (WARN_ON(!(gup_flags & FOLL_PIN)))
+ return -EINVAL;
flags = memalloc_pin_save();
do {
- rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, gup_flags);
- if (rc <= 0)
+ nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
+ pages, vmas, NULL,
+ gup_flags);
+ if (nr_pinned_pages <= 0) {
+ rc = nr_pinned_pages;
break;
- rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
- } while (!rc);
+ }
+ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
+ } while (rc == -EAGAIN);
memalloc_pin_restore(flags);
- return rc;
+ return rc ? rc : nr_pinned_pages;
}
static bool is_valid_gup_flags(unsigned int gup_flags)
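For callers the net effect is that a FOLL_LONGTERM pin either comes back fully pinnable or fails; the -EAGAIN retry never escapes __gup_longterm_locked(). A hypothetical driver-side sketch (pin_user_buffer is an invented name; pin_user_pages_fast() and unpin_user_pages() are the ordinary GUP API):

static int pin_user_buffer(unsigned long uaddr, int nr_pages,
			   struct page **pages)
{
	int pinned;

	/*
	 * Unpinnable pages (CMA, ZONE_MOVABLE, device-coherent) are
	 * migrated out before a positive count is returned. Note that
	 * pinned may still be less than nr_pages on a partial pin.
	 */
	pinned = pin_user_pages_fast(uaddr, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0)
		return pinned;	/* fault or migration failure propagated */

	/* ... long-lived access to @pages, e.g. DMA ... */

	unpin_user_pages(pages, pinned);
	return 0;
}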
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f42bb51e023a..7bf2299cb24b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -70,9 +70,8 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf)
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+ bool smaps, bool in_pf, bool enforce_sysfs)
{
if (!vma->vm_mm) /* vdso */
return false;
@@ -121,11 +120,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
if (!in_pf && shmem_file(vma->vm_file))
return shmem_huge_enabled(vma);
- if (!hugepage_flags_enabled())
- return false;
-
- /* THP settings require madvise. */
- if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+ /* Enforce sysfs THP requirements as necessary */
+ if (enforce_sysfs &&
+ (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
+ !hugepage_flags_always())))
return false;
/* Only regular file is valid */
@@ -772,8 +770,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- if (pgtable)
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
}
@@ -1479,7 +1476,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = NUMA_NO_NODE;
- int target_nid, last_cpupid = -1;
+ int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
bool migrated = false;
bool was_writable = pmd_savedwrite(oldpmd);
int flags = 0;
@@ -1500,7 +1497,12 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
flags |= TNF_NO_GROUP;
page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
+ /*
+ * For memory tiering mode, cpupid of slow memory page is used
+ * to record page access time. So use default value.
+ */
+ if (node_is_toptier(page_nid))
+ last_cpupid = page_cpupid_last(page);
target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
&flags);
@@ -1824,6 +1826,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (prot_numa) {
struct page *page;
+ bool toptier;
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -1836,13 +1839,18 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto unlock;
page = pmd_page(*pmd);
+ toptier = node_is_toptier(page_to_nid(page));
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- node_is_toptier(page_to_nid(page)))
+ toptier)
goto unlock;
+
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page, jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
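Read together with the do_huge_pmd_numa_page() hunk above, the two sides of the cpupid reuse condense to roughly the following (a sketch of the hunks, not new code; xchg_page_access_time() is introduced elsewhere in this series):

/* NUMA-balancing scan, memory tiering mode, slow-tier (!toptier) page: */
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !toptier)
	xchg_page_access_time(page, jiffies_to_msecs(jiffies));

/* THP NUMA fault: only trust the cpupid bits on top-tier nodes */
last_cpupid = node_is_toptier(page_nid) ? page_cpupid_last(page)
					: (-1 & LAST_CPUPID_MASK);

Masking the -1 default keeps the sentinel within the width of the cpupid field, so it compares consistently with values read back via page_cpupid_last().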
@@ -2140,6 +2148,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*
* In case we cannot clear PageAnonExclusive(), split the PMD
* only and let try_to_migrate_one() fail later.
+ *
+ * See page_try_share_anon_rmap(): invalidate PMD first.
*/
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
@@ -2288,25 +2298,11 @@ out:
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
bool freeze, struct folio *folio)
{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return;
+ pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
- p4d = p4d_offset(pgd, address);
- if (!p4d_present(*p4d))
+ if (!pmd)
return;
- pud = pud_offset(p4d, address);
- if (!pud_present(*pud))
- return;
-
- pmd = pmd_offset(pud, address);
-
__split_huge_pmd(vma, pmd, address, freeze, folio);
}
@@ -2649,6 +2645,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
+ gfp_t gfp;
+
mapping = head->mapping;
/* Truncated ? */
@@ -2657,8 +2655,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
- xas_split_alloc(&xas, head, compound_order(head),
- mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+ gfp = current_gfp_context(mapping_gfp_mask(mapping) &
+ GFP_RECLAIM_MASK);
+
+ if (folio_test_private(folio) &&
+ !filemap_release_folio(folio, gfp)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ xas_split_alloc(&xas, head, compound_order(head), gfp);
if (xas_error(&xas)) {
ret = xas_error(&xas);
goto out;
@@ -3175,6 +3181,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ /* See page_try_share_anon_rmap(): invalidate PMD first. */
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pmd_at(mm, address, pvmw->pmd, pmdval);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0bdfc7e1c933..2ca4e8c3163e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,14 +456,12 @@ static int allocate_file_region_entries(struct resv_map *resv,
int regions_needed)
__must_hold(&resv->lock)
{
- struct list_head allocated_regions;
+ LIST_HEAD(allocated_regions);
int to_allocate = 0, i = 0;
struct file_region *trg = NULL, *rg = NULL;
VM_BUG_ON(regions_needed < 0);
- INIT_LIST_HEAD(&allocated_regions);
-
/*
* Check for sufficient descriptors in the cache to accommodate
* the number of in progress add operations plus regions_needed.
@@ -1506,6 +1504,10 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
set_page_private(page, 0);
+ /*
+ * We have to set HPageVmemmapOptimized again as above
+ * set_page_private(page, 0) cleared it.
+ */
SetHPageVmemmapOptimized(page);
/*
@@ -2336,7 +2338,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
- struct list_head surplus_list;
+ LIST_HEAD(surplus_list);
struct page *page, *tmp;
int ret;
long i;
@@ -2351,7 +2353,6 @@ static int gather_surplus_pages(struct hstate *h, long delta)
}
allocated = 0;
- INIT_LIST_HEAD(&surplus_list);
ret = -ENOMEM;
retry:
@@ -3474,7 +3475,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
* based on pool changes for the demoted page.
*/
h->max_huge_pages--;
- target_hstate->max_huge_pages += pages_per_huge_page(h);
+ target_hstate->max_huge_pages +=
+ pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
return rc;
}
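A quick sanity check of the corrected accounting, assuming the usual x86-64 sizes: demoting one 1 GiB page gives pages_per_huge_page(h) = 262144 base pages and pages_per_huge_page(target_hstate) = 512 for a 2 MiB target, so the target pool should grow by 262144 / 512 = 512 huge pages per demoted page. The previous code added pages_per_huge_page(h) directly, over-counting the target hstate's max_huge_pages by a factor of pages_per_huge_page(target_hstate).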
@@ -3767,8 +3769,7 @@ HSTATE_ATTR_WO(demote);
static ssize_t demote_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int nid;
- struct hstate *h = kobj_to_hstate(kobj, &nid);
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
return sysfs_emit(buf, "%lukB\n", demote_size);
@@ -3781,7 +3782,6 @@ static ssize_t demote_size_store(struct kobject *kobj,
struct hstate *h, *demote_hstate;
unsigned long demote_size;
unsigned int demote_order;
- int nid;
demote_size = (unsigned long)memparse(buf, NULL);
@@ -3793,7 +3793,7 @@ static ssize_t demote_size_store(struct kobject *kobj,
return -EINVAL;
/* demote order must be smaller than hstate order */
- h = kobj_to_hstate(kobj, &nid);
+ h = kobj_to_hstate(kobj, NULL);
if (demote_order >= h->order)
return -EINVAL;
@@ -3847,15 +3847,22 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
if (retval) {
kobject_put(hstate_kobjs[hi]);
hstate_kobjs[hi] = NULL;
+ return retval;
}
if (h->demote_order) {
- if (sysfs_create_group(hstate_kobjs[hi],
- &hstate_demote_attr_group))
+ retval = sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group);
+ if (retval) {
pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+ kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ return retval;
+ }
}
- return retval;
+ return 0;
}
static void __init hugetlb_sysfs_init(void)
@@ -3941,10 +3948,15 @@ static void hugetlb_unregister_node(struct node *node)
for_each_hstate(h) {
int idx = hstate_index(h);
- if (nhs->hstate_kobjs[idx]) {
- kobject_put(nhs->hstate_kobjs[idx]);
- nhs->hstate_kobjs[idx] = NULL;
- }
+ struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+ if (!hstate_kobj)
+ continue;
+ if (h->demote_order)
+ sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+ sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+ kobject_put(hstate_kobj);
+ nhs->hstate_kobjs[idx] = NULL;
}
kobject_put(nhs->hugepages_kobj);
@@ -4019,6 +4031,14 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_CMA
+static void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
static int __init hugetlb_init(void)
{
int i;
@@ -4118,7 +4138,7 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_alloc = first_memory_node;
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
- huge_page_size(h)/1024);
+ huge_page_size(h)/SZ_1K);
parsed_hstate = h;
}
@@ -4133,11 +4153,11 @@ static void __init hugepages_clear_pages_in_node(void)
if (!hugetlb_max_hstate) {
default_hstate_max_huge_pages = 0;
memset(default_hugepages_in_node, 0,
- MAX_NUMNODES * sizeof(unsigned int));
+ sizeof(default_hugepages_in_node));
} else {
parsed_hstate->max_huge_pages = 0;
memset(parsed_hstate->max_huge_pages_node, 0,
- MAX_NUMNODES * sizeof(unsigned int));
+ sizeof(parsed_hstate->max_huge_pages_node));
}
}
@@ -4332,18 +4352,34 @@ static int __init default_hugepagesz_setup(char *s)
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+ struct mempolicy *mpol = get_task_policy(current);
+
+ /*
+ * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+ * (from policy_nodemask) specifically for hugetlb case
+ */
+ if (mpol->mode == MPOL_BIND &&
+ (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+ return &mpol->nodes;
+#endif
+ return NULL;
+}
+
static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
- nodemask_t *mpol_allowed;
+ nodemask_t *mbind_nodemask;
unsigned int *array = h->free_huge_pages_node;
gfp_t gfp_mask = htlb_alloc_mask(h);
- mpol_allowed = policy_nodemask_current(gfp_mask);
-
+ mbind_nodemask = policy_mbind_nodemask(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mpol_allowed || node_isset(node, *mpol_allowed))
+ if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
nr += array[node];
}
@@ -4723,7 +4759,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma)
{
- pte_t *src_pte, *dst_pte, entry, dst_entry;
+ pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
@@ -4768,15 +4804,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
/*
* If the pagetables are shared don't copy or take references.
- * dst_pte == src_pte is the common case of src/dest sharing.
*
+ * dst_pte == src_pte is the common case of src/dest sharing.
* However, src could have 'unshared' and dst shares with
- * another vma. If dst_pte !none, this implies sharing.
- * Check here before taking page table lock, and once again
- * after taking the lock below.
+ * another vma. So page_count of ptep page is checked instead
+ * to reliably determine whether pte is shared.
*/
- dst_entry = huge_ptep_get(dst_pte);
- if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+ if (page_count(virt_to_page(dst_pte)) > 1) {
addr |= last_addr_mask;
continue;
}
@@ -4785,13 +4819,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- dst_entry = huge_ptep_get(dst_pte);
again:
- if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ if (huge_pte_none(entry)) {
/*
- * Skip if src entry none. Also, skip in the
- * unlikely case dst entry !none as this implies
- * sharing with another vma.
+ * Skip if src entry none.
*/
;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4870,7 +4901,7 @@ again:
restore_reserve_on_error(h, dst_vma, addr,
new);
put_page(new);
- /* dst_entry won't change as in child */
+ /* huge_ptep of dst_pte won't change as in child */
goto again;
}
hugetlb_install_page(dst_vma, dst_pte, addr, new);
@@ -5316,7 +5347,6 @@ retry_avoidcopy:
u32 hash;
put_page(old_page);
- BUG_ON(huge_pte_none(pte));
/*
* Drop hugetlb_fault_mutex and i_mmap_rwsem before
* unmapping. unmapping needs to hold i_mmap_rwsem
@@ -5408,19 +5438,6 @@ out_release_old:
return ret;
}
-/* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
-{
- struct address_space *mapping;
- pgoff_t idx;
-
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
-
- return find_lock_page(mapping, idx);
-}
-
/*
* Return whether there is a pagecache page to back given address within VMA.
* Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
@@ -5547,7 +5564,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (idx >= size)
goto out;
-retry:
new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
@@ -5587,9 +5603,15 @@ retry:
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
if (err) {
+ /*
+ * err can't be -EEXIST which implies someone
+ * else consumed the reservation since hugetlb
+ * fault mutex is held when add a hugetlb page
+ * fault mutex is held when adding a hugetlb page
+ * to the page cache. So it's safe to call
+ * restore_reserve_on_error() here.
+ */
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
- if (err == -EEXIST)
- goto retry;
goto out;
}
new_pagecache_page = true;
@@ -5810,7 +5832,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
+ pagecache_page = find_lock_page(mapping, idx);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -6017,8 +6039,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
page_in_pagecache = true;
}
- ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, dst_mm, dst_pte);
/*
* Recheck the i_size after holding PT lock to make sure not
@@ -7334,7 +7355,7 @@ void __init hugetlb_cma_reserve(int order)
hugetlb_cma_size = 0;
}
-void __init hugetlb_cma_check(void)
+static void __init hugetlb_cma_check(void)
{
if (!hugetlb_cma_size || cma_reserve_called)
return;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index c86691c431fd..f61d132df52b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -75,11 +75,11 @@ parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
- int idx;
+ struct hstate *h;
- for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+ for_each_hstate(h) {
if (page_counter_read(
- hugetlb_cgroup_counter_from_cgroup(h_cg, idx)))
+ hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
return true;
}
return false;
@@ -154,9 +154,9 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
* function.
*/
for_each_node(node) {
- /* Set node_to_alloc to -1 for offline nodes. */
+ /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
int node_to_alloc =
- node_state(node, N_NORMAL_MEMORY) ? node : -1;
+ node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
h_cgroup->nodeinfo[node] =
kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
GFP_KERNEL, node_to_alloc);
@@ -225,17 +225,14 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
struct page *page;
- int idx;
do {
- idx = 0;
for_each_hstate(h) {
spin_lock_irq(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
- hugetlb_cgroup_move_parent(idx, h_cg, page);
+ hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);
spin_unlock_irq(&hugetlb_lock);
- idx++;
}
cond_resched();
} while (hugetlb_cgroup_have_usage(h_cg));
@@ -442,7 +439,7 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
return;
- if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 &&
+ if (rg->reservation_counter && resv->pages_per_hpage &&
!resv->reservation_counter) {
page_counter_uncharge(rg->reservation_counter,
nr_pages * resv->pages_per_hpage);
@@ -675,12 +672,12 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
- if (hsize >= (1UL << 30))
- snprintf(buf, size, "%luGB", hsize >> 30);
- else if (hsize >= (1UL << 20))
- snprintf(buf, size, "%luMB", hsize >> 20);
+ if (hsize >= SZ_1G)
+ snprintf(buf, size, "%luGB", hsize / SZ_1G);
+ else if (hsize >= SZ_1M)
+ snprintf(buf, size, "%luMB", hsize / SZ_1M);
else
- snprintf(buf, size, "%luKB", hsize >> 10);
+ snprintf(buf, size, "%luKB", hsize / SZ_1K);
return buf;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 20f414c0379f..ba2a2596fb4e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -265,11 +265,10 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
static inline void reset_struct_pages(struct page *start)
{
- int i;
struct page *from = start + NR_RESET_STRUCT_PAGE;
- for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
- memcpy(start + i, from, sizeof(*from));
+ BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
+ memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
@@ -287,6 +286,11 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
copy_page(to, (void *)walk->reuse_addr);
reset_struct_pages(to);
+ /*
+ * Makes sure that preceding stores to the page contents become visible
+ * before the set_pte_at() write.
+ */
+ smp_wmb();
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
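The new BUILD_BUG_ON is what makes the single memcpy() safe. Assuming the common 4 KiB page with a 64-byte struct page (illustrative sizes only), PAGE_SIZE / sizeof(struct page) = 64, so the check requires NR_RESET_STRUCT_PAGE <= 32; since start is the beginning of the page that copy_page() just filled and from = start + NR_RESET_STRUCT_PAGE, the source range [from, from + NR_RESET_STRUCT_PAGE) is guaranteed to stay inside that same page and is disjoint from the destination, which is why the per-entry copy loop can collapse into one memcpy().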
diff --git a/mm/internal.h b/mm/internal.h
index 785409805ed7..55ce10e4d0c0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -187,7 +187,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason
/*
* in mm/rmap.c:
*/
-extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/page_alloc.c
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index c252081b11df..8c08ae2101d7 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -1003,6 +1003,13 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
return NULL;
}
+ /*
+ * Skip allocations for this slab, if KFENCE has been disabled for
+ * this slab.
+ */
+ if (s->flags & SLAB_SKIP_KFENCE)
+ return NULL;
+
if (atomic_inc_return(&kfence_allocation_gate) > 1)
return NULL;
#ifdef CONFIG_KFENCE_STATIC_KEYS
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 70b7ac66411c..0bcba493ebb4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -28,6 +28,7 @@ enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
SCAN_PMD_NULL,
+ SCAN_PMD_MAPPED,
SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
SCAN_EXCEED_SHARED_PTE,
@@ -73,6 +74,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* default collapse hugepages if there is at least one pte mapped like
* it would have happened if the vma was large enough during page
* fault.
+ *
+ * Note that these are only respected if collapse was initiated by khugepaged.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
@@ -85,6 +88,16 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
#define MAX_PTE_MAPPED_THP 8
+struct collapse_control {
+ bool is_khugepaged;
+
+ /* Num pages scanned per node */
+ u32 node_load[MAX_NUMNODES];
+
+ /* Last target selected in hpage_collapse_find_target_node() */
+ int last_target_node;
+};
+
/**
* struct mm_slot - hash lookup from mm to mm_slot
* @hash: hash collision list
@@ -425,7 +438,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}
-static inline int khugepaged_test_exit(struct mm_struct *mm)
+static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
return atomic_read(&mm->mm_users) == 0;
}
@@ -440,7 +453,7 @@ void __khugepaged_enter(struct mm_struct *mm)
return;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
+ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return;
@@ -466,7 +479,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_flags_enabled()) {
- if (hugepage_vma_check(vma, vm_flags, false, false))
+ if (hugepage_vma_check(vma, vm_flags, false, false, true))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -492,11 +505,10 @@ void __khugepaged_exit(struct mm_struct *mm)
} else if (mm_slot) {
/*
* This is required to serialize against
- * khugepaged_test_exit() (which is guaranteed to run
- * under mmap sem read mode). Stop here (after we
- * return all pagetables will be destroyed) until
- * khugepaged has finished working on the pagetables
- * under the mmap_lock.
+ * hpage_collapse_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we return all
+ * pagetables will be destroyed) until khugepaged has finished
+ * working on the pagetables under the mmap_lock.
*/
mmap_write_lock(mm);
mmap_write_unlock(mm);
@@ -546,11 +558,12 @@ static bool is_refcount_suitable(struct page *page)
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
+ struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
bool writable = false;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
@@ -558,8 +571,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
+ ++none_or_zero;
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -579,11 +594,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageAnon(page), page);
- if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
- result = SCAN_EXCEED_SHARED_PTE;
- count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
- goto out;
+ if (page_mapcount(page) > 1) {
+ ++shared;
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ goto out;
+ }
}
if (PageCompound(page)) {
@@ -646,10 +664,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (PageCompound(page))
list_add_tail(&page->lru, compound_pagelist);
next:
- /* There should be enough young pte to collapse the page */
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
+ /*
+ * If collapse was initiated by khugepaged, check that there is
+ * enough young PTEs to justify collapsing the page
+ */
+ if (cc->is_khugepaged &&
+ (pte_young(pteval) || page_is_young(page) ||
+ PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ address)))
referenced++;
if (pte_write(pteval))
@@ -658,19 +680,19 @@ next:
if (unlikely(!writable)) {
result = SCAN_PAGE_RO;
- } else if (unlikely(!referenced)) {
+ } else if (unlikely(cc->is_khugepaged && !referenced)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
- return 1;
+ return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
- return 0;
+ return result;
}
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -735,9 +757,12 @@ static void khugepaged_alloc_sleep(void)
remove_wait_queue(&khugepaged_wait, &wait);
}
-static int khugepaged_node_load[MAX_NUMNODES];
+struct collapse_control khugepaged_collapse_control = {
+ .is_khugepaged = true,
+ .last_target_node = NUMA_NO_NODE,
+};
-static bool khugepaged_scan_abort(int nid)
+static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
int i;
@@ -749,11 +774,11 @@ static bool khugepaged_scan_abort(int nid)
return false;
/* If there is a count for this node already, it must be acceptable */
- if (khugepaged_node_load[nid])
+ if (cc->node_load[nid])
return false;
for (i = 0; i < MAX_NUMNODES; i++) {
- if (!khugepaged_node_load[i])
+ if (!cc->node_load[i])
continue;
if (node_distance(nid, i) > node_reclaim_distance)
return true;
@@ -772,146 +797,62 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
}
#ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
- static int last_khugepaged_target_node = NUMA_NO_NODE;
int nid, target_node = 0, max_value = 0;
/* find first node with max normal pages hit */
for (nid = 0; nid < MAX_NUMNODES; nid++)
- if (khugepaged_node_load[nid] > max_value) {
- max_value = khugepaged_node_load[nid];
+ if (cc->node_load[nid] > max_value) {
+ max_value = cc->node_load[nid];
target_node = nid;
}
/* do some balance if several nodes have the same hit record */
- if (target_node <= last_khugepaged_target_node)
- for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
- nid++)
- if (max_value == khugepaged_node_load[nid]) {
+ if (target_node <= cc->last_target_node)
+ for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == cc->node_load[nid]) {
target_node = nid;
break;
}
- last_khugepaged_target_node = target_node;
+ cc->last_target_node = target_node;
return target_node;
}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+#else
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
- if (IS_ERR(*hpage)) {
- if (!*wait)
- return false;
-
- *wait = false;
- *hpage = NULL;
- khugepaged_alloc_sleep();
- } else if (*hpage) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- return true;
+ return 0;
}
+#endif
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
- VM_BUG_ON_PAGE(*hpage, *hpage);
-
*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- *hpage = ERR_PTR(-ENOMEM);
- return NULL;
+ return false;
}
prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
- return *hpage;
-}
-#else
-static int khugepaged_find_target_node(void)
-{
- return 0;
-}
-
-static inline struct page *alloc_khugepaged_hugepage(void)
-{
- struct page *page;
-
- page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
- HPAGE_PMD_ORDER);
- if (page)
- prep_transhuge_page(page);
- return page;
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
- struct page *hpage;
-
- do {
- hpage = alloc_khugepaged_hugepage();
- if (!hpage) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- if (!*wait)
- return NULL;
-
- *wait = false;
- khugepaged_alloc_sleep();
- } else
- count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
-
- return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
- /*
- * If the hpage allocated earlier was briefly exposed in page cache
- * before collapse_file() failed, it is possible that racing lookups
- * have not yet completed, and would then be unpleasantly surprised by
- * finding the hpage reused for the same mapping at a different offset.
- * Just release the previous allocation if there is any danger of that.
- */
- if (*hpage && page_count(*hpage) > 1) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- if (!*hpage)
- *hpage = khugepaged_alloc_hugepage(wait);
-
- if (unlikely(!*hpage))
- return false;
-
return true;
}
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
-{
- VM_BUG_ON(!*hpage);
-
- return *hpage;
-}
-#endif
-
/*
* If mmap_lock temporarily dropped, revalidate vma
* before taking mmap_lock.
- * Return 0 if succeeds, otherwise return none-zero
- * value (scan code).
+ * Returns enum scan_result value.
*/
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- struct vm_area_struct **vmap)
+ struct vm_area_struct **vmap,
+ struct collapse_control *cc)
{
struct vm_area_struct *vma;
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
@@ -920,7 +861,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!transhuge_vma_suitable(vma, address))
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
+ cc->is_khugepaged))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -931,21 +873,60 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
*/
if (!vma->anon_vma || !vma_is_anonymous(vma))
return SCAN_VMA_CHECK;
- return 0;
+ return SCAN_SUCCEED;
+}
+
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t **pmd)
+{
+ pmd_t pmde;
+
+ *pmd = mm_find_pmd(mm, address);
+ if (!*pmd)
+ return SCAN_PMD_NULL;
+
+ pmde = pmd_read_atomic(*pmd);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
+ barrier();
+#endif
+ if (!pmd_present(pmde))
+ return SCAN_PMD_NULL;
+ if (pmd_trans_huge(pmde))
+ return SCAN_PMD_MAPPED;
+ if (pmd_bad(pmde))
+ return SCAN_PMD_NULL;
+ return SCAN_SUCCEED;
+}
+
+static int check_pmd_still_valid(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmd)
+{
+ pmd_t *new_pmd;
+ int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
+
+ if (result != SCAN_SUCCEED)
+ return result;
+ if (new_pmd != pmd)
+ return SCAN_FAIL;
+ return SCAN_SUCCEED;
}
/*
* Bring missing pages in from swap, to complete THP collapse.
- * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held.
* Note that if false is returned, mmap_lock will be released.
*/
-static bool __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
- int referenced)
+static int __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
@@ -976,12 +957,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
*/
if (ret & VM_FAULT_RETRY) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
+ /* Likely, but not guaranteed, that page lock failed */
+ return SCAN_PAGE_LOCK;
}
if (ret & VM_FAULT_ERROR) {
mmap_read_unlock(mm);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
+ return SCAN_FAIL;
}
swapped_in++;
}
@@ -991,30 +973,41 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
lru_add_drain();
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
- return true;
+ return SCAN_SUCCEED;
}
-static void collapse_huge_page(struct mm_struct *mm,
- unsigned long address,
- struct page **hpage,
- int node, int referenced, int unmapped)
+static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
+ struct collapse_control *cc)
+{
+ /* Only allocate from the target node */
+ gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
+ GFP_TRANSHUGE) | __GFP_THISNODE;
+ int node = hpage_collapse_find_target_node(cc);
+
+ if (!hpage_collapse_alloc_page(hpage, gfp, node))
+ return SCAN_ALLOC_HUGE_PAGE_FAIL;
+ if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
+ return SCAN_CGROUP_CHARGE_FAIL;
+ count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+ return SCAN_SUCCEED;
+}
+
+static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped,
+ struct collapse_control *cc)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
- struct page *new_page;
+ struct page *hpage;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated = 0, result = 0;
+ int result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
- gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
@@ -1022,40 +1015,34 @@ static void collapse_huge_page(struct mm_struct *mm,
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
- new_page = khugepaged_alloc_page(hpage, gfp, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
- goto out_nolock;
- }
- if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
+ result = alloc_charge_hpage(&hpage, mm, cc);
+ if (result != SCAN_SUCCEED)
goto out_nolock;
- }
- count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, &vma);
- if (result) {
+ result = hugepage_vma_revalidate(mm, address, &vma, cc);
+ if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
+ result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- /*
- * __collapse_huge_page_swapin will return with mmap_lock released
- * when it fails. So we jump out_nolock directly in that case.
- * Continuing to collapse causes inconsistency.
- */
- if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
- pmd, referenced)) {
- goto out_nolock;
+ if (unmapped) {
+ /*
+ * __collapse_huge_page_swapin will return with mmap_lock
+ * released when it fails. So we jump out_nolock directly in
+ * that case. Continuing to collapse causes inconsistency.
+ */
+ result = __collapse_huge_page_swapin(mm, vma, address, pmd,
+ referenced);
+ if (result != SCAN_SUCCEED)
+ goto out_nolock;
}
mmap_read_unlock(mm);
@@ -1065,11 +1052,12 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, &vma);
- if (result)
+ result = hugepage_vma_revalidate(mm, address, &vma, cc);
+ if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
- if (mm_find_pmd(mm, address) != pmd)
+ result = check_pmd_still_valid(mm, address, pmd);
+ if (result != SCAN_SUCCEED)
goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
@@ -1095,11 +1083,11 @@ static void collapse_huge_page(struct mm_struct *mm,
mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
- isolated = __collapse_huge_page_isolate(vma, address, pte,
- &compound_pagelist);
+ result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ &compound_pagelist);
spin_unlock(pte_ptl);
- if (unlikely(!isolated)) {
+ if (unlikely(result != SCAN_SUCCEED)) {
pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
@@ -1111,7 +1099,6 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
- result = SCAN_FAIL;
goto out_up_write;
}
@@ -1121,8 +1108,8 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
- &compound_pagelist);
+ __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
+ &compound_pagelist);
pte_unmap(pte);
/*
* spin_lock() below is not the equivalent of smp_wmb(), but
@@ -1130,42 +1117,43 @@ static void collapse_huge_page(struct mm_struct *mm,
* avoid the copy_huge_page writes to become visible after
* the set_pmd_at() write.
*/
- __SetPageUptodate(new_page);
+ __SetPageUptodate(hpage);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ page_add_new_anon_rmap(hpage, vma, address);
+ lru_cache_add_inactive_or_unevictable(hpage, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
- *hpage = NULL;
+ hpage = NULL;
- khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_up_write:
mmap_write_unlock(mm);
out_nolock:
- if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(page_folio(*hpage));
- trace_mm_collapse_huge_page(mm, isolated, result);
- return;
+ if (hpage) {
+ mem_cgroup_uncharge(page_folio(hpage));
+ put_page(hpage);
+ }
+ trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+ return result;
}
-static int khugepaged_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- struct page **hpage)
+static int hpage_collapse_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, bool *mmap_locked,
+ struct collapse_control *cc)
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, result = 0, referenced = 0;
+ int result = SCAN_FAIL, referenced = 0;
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
unsigned long _address;
@@ -1175,19 +1163,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
+ result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ if (result != SCAN_SUCCEED)
goto out;
- }
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ memset(cc->node_load, 0, sizeof(cc->node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
- if (++unmapped <= khugepaged_max_ptes_swap) {
+ ++unmapped;
+ if (!cc->is_khugepaged ||
+ unmapped <= khugepaged_max_ptes_swap) {
/*
* Always be strict with uffd-wp
* enabled swap entries. Please see
@@ -1205,8 +1193,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
}
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ ++none_or_zero;
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -1236,27 +1226,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
- if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
- result = SCAN_EXCEED_SHARED_PTE;
- count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
- goto out_unmap;
+ if (page_mapcount(page) > 1) {
+ ++shared;
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ goto out_unmap;
+ }
}
page = compound_head(page);
/*
* Record which node the original page is from and save this
- * information to khugepaged_node_load[].
+ * information to cc->node_load[].
* Khugepaged will allocate hugepage from the node has the max
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
+ if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
- khugepaged_node_load[node]++;
+ cc->node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
goto out_unmap;
@@ -1291,31 +1284,38 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
+
+ /*
+ * If collapse was initiated by khugepaged, check that there is
+ * enough young PTEs to justify collapsing the page
+ */
+ if (cc->is_khugepaged &&
+ (pte_young(pteval) || page_is_young(page) ||
+ PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ address)))
referenced++;
}
if (!writable) {
result = SCAN_PAGE_RO;
- } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+ } else if (cc->is_khugepaged &&
+ (!referenced ||
+ (unmapped && referenced < HPAGE_PMD_NR / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- ret = 1;
}
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- node = khugepaged_find_target_node();
+ if (result == SCAN_SUCCEED) {
+ result = collapse_huge_page(mm, address, referenced,
+ unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
- collapse_huge_page(mm, address, hpage, node,
- referenced, unmapped);
+ *mmap_locked = false;
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
none_or_zero, result, unmapped);
- return ret;
+ return result;
}
static void collect_mm_slot(struct mm_slot *mm_slot)
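A hypothetical non-khugepaged caller of the renamed scan would look roughly like this (illustrative only; collapse_one_pmd is an invented name, and such a caller, e.g. a MADV_COLLAPSE style entry point, is not part of this hunk). With is_khugepaged == false the scan ignores the khugepaged_max_ptes_* limits and the sysfs enabled/madvise policy, and the SCAN_* result is reported to the caller instead of bumping khugepaged_pages_collapsed:

/* Caller holds mmap_read_lock(mm) and looked up @vma under it. */
static int collapse_one_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr, bool *mmap_locked)
{
	struct collapse_control *cc;
	int result;

	/* node_load[] scales with MAX_NUMNODES, so keep cc off the stack. */
	cc = kzalloc(sizeof(*cc), GFP_KERNEL);
	if (!cc)
		return -ENOMEM;
	cc->is_khugepaged = false;
	cc->last_target_node = NUMA_NO_NODE;

	*mmap_locked = true;
	result = hpage_collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
	/*
	 * SCAN_SUCCEED means a huge page was installed; any other SCAN_*
	 * value explains why not. *mmap_locked reports whether mmap_lock
	 * is still held on return.
	 */
	kfree(cc);
	return result;
}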
@@ -1324,7 +1324,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
lockdep_assert_held(&khugepaged_mm_lock);
- if (khugepaged_test_exit(mm)) {
+ if (hpage_collapse_test_exit(mm)) {
/* free mm_slot */
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
@@ -1402,12 +1402,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
return;
/*
- * This vm_flags may not have VM_HUGEPAGE if the page was not
- * collapsed by this mm. But we can still collapse if the page is
- * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
- * will not fail the vma for missing VM_HUGEPAGE
+ * If we are here, we've succeeded in replacing all the native pages
+ * in the page cache with a single hugepage. If a mm were to fault-in
+ * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
+ * and map it by a PMD, regardless of sysfs THP settings. As such, let's
+ * analogously elide sysfs THP settings here.
*/
- if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
return;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -1422,8 +1423,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!PageHead(hpage))
goto drop_hpage;
- pmd = mm_find_pmd(mm, haddr);
- if (!pmd)
+ if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED)
goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
@@ -1497,7 +1497,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
if (!mmap_write_trylock(mm))
return;
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
goto out;
for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
@@ -1541,8 +1541,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
mm = vma->vm_mm;
- pmd = mm_find_pmd(mm, addr);
- if (!pmd)
+ if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
continue;
/*
* We need exclusive mmap_lock to retract page table.
@@ -1560,7 +1559,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* it'll always mapped in small page size for uffd-wp
* registered ranges.
*/
- if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma))
+ if (!hpage_collapse_test_exit(mm) &&
+ !userfaultfd_wp(vma))
collapse_and_free_pmd(mm, vma, addr, pmd);
mmap_write_unlock(mm);
} else {
@@ -1577,8 +1577,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* @mm: process address space where collapse happens
* @file: file that collapse on
* @start: collapse start address
- * @hpage: new allocated huge page for collapse
- * @node: appointed node the new huge page allocate from
+ * @cc: collapse context and scratchpad
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
@@ -1595,13 +1594,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
-static void collapse_file(struct mm_struct *mm,
- struct file *file, pgoff_t start,
- struct page **hpage, int node)
+static int collapse_file(struct mm_struct *mm, struct file *file,
+ pgoff_t start, struct collapse_control *cc)
{
struct address_space *mapping = file->f_mapping;
- gfp_t gfp;
- struct page *new_page;
+ struct page *hpage;
pgoff_t index, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1612,20 +1609,9 @@ static void collapse_file(struct mm_struct *mm,
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
- new_page = khugepaged_alloc_page(hpage, gfp, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
- goto out;
- }
-
- if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
+ result = alloc_charge_hpage(&hpage, mm, cc);
+ if (result != SCAN_SUCCEED)
goto out;
- }
- count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
/*
* Ensure we have slots for all the pages in the range. This is
@@ -1643,14 +1629,14 @@ static void collapse_file(struct mm_struct *mm,
}
} while (1);
- __SetPageLocked(new_page);
+ __SetPageLocked(hpage);
if (is_shmem)
- __SetPageSwapBacked(new_page);
- new_page->index = start;
- new_page->mapping = mapping;
+ __SetPageSwapBacked(hpage);
+ hpage->index = start;
+ hpage->mapping = mapping;
/*
- * At this point the new_page is locked and not up-to-date.
+ * At this point the hpage is locked and not up-to-date.
* It's safe to insert it into the page cache, because nobody would
* be able to map it or use it in another way until we unlock it.
*/
@@ -1678,7 +1664,7 @@ static void collapse_file(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
nr_none++;
continue;
}
@@ -1820,19 +1806,19 @@ static void collapse_file(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
continue;
out_unlock:
unlock_page(page);
put_page(page);
goto xa_unlocked;
}
- nr = thp_nr_pages(new_page);
+ nr = thp_nr_pages(hpage);
if (is_shmem)
- __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
+ __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
else {
- __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
+ __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
filemap_nr_thps_inc(mapping);
/*
* Paired with smp_mb() in do_dentry_open() to ensure
@@ -1843,21 +1829,21 @@ out_unlock:
smp_mb();
if (inode_is_open_for_write(mapping->host)) {
result = SCAN_FAIL;
- __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
+ __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
goto xa_locked;
}
}
if (nr_none) {
- __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
+ __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+ __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
}
/* Join all the small entries into a single multi-index entry */
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
@@ -1879,11 +1865,11 @@ xa_unlocked:
index = start;
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
while (index < page->index) {
- clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
- copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
- page);
+ copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
+ page);
list_del(&page->lru);
page->mapping = NULL;
page_ref_unfreeze(page, 1);
@@ -1894,23 +1880,22 @@ xa_unlocked:
index++;
}
while (index < end) {
- clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
- SetPageUptodate(new_page);
- page_ref_add(new_page, HPAGE_PMD_NR - 1);
+ SetPageUptodate(hpage);
+ page_ref_add(hpage, HPAGE_PMD_NR - 1);
if (is_shmem)
- set_page_dirty(new_page);
- lru_cache_add(new_page);
+ set_page_dirty(hpage);
+ lru_cache_add(hpage);
/*
* Remove pte page tables, so we can re-fault the page as huge.
*/
retract_page_tables(mapping, start);
- *hpage = NULL;
-
- khugepaged_pages_collapsed++;
+ unlock_page(hpage);
+ hpage = NULL;
} else {
struct page *page;
@@ -1949,19 +1934,23 @@ xa_unlocked:
VM_BUG_ON(nr_none);
xas_unlock_irq(&xas);
- new_page->mapping = NULL;
+ hpage->mapping = NULL;
}
- unlock_page(new_page);
+ if (hpage)
+ unlock_page(hpage);
out:
VM_BUG_ON(!list_empty(&pagelist));
- if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(page_folio(*hpage));
+ if (hpage) {
+ mem_cgroup_uncharge(page_folio(hpage));
+ put_page(hpage);
+ }
/* TODO: tracepoints */
+ return result;
}
-static void khugepaged_scan_file(struct mm_struct *mm,
- struct file *file, pgoff_t start, struct page **hpage)
+static int khugepaged_scan_file(struct mm_struct *mm, struct file *file,
+ pgoff_t start, struct collapse_control *cc)
{
struct page *page = NULL;
struct address_space *mapping = file->f_mapping;
@@ -1972,14 +1961,16 @@ static void khugepaged_scan_file(struct mm_struct *mm,
present = 0;
swap = 0;
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ memset(cc->node_load, 0, sizeof(cc->node_load));
rcu_read_lock();
xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page)) {
- if (++swap > khugepaged_max_ptes_swap) {
+ ++swap;
+ if (cc->is_khugepaged &&
+ swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
@@ -1997,11 +1988,11 @@ static void khugepaged_scan_file(struct mm_struct *mm,
}
node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
+ if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
break;
}
- khugepaged_node_load[node]++;
+ cc->node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
@@ -2030,20 +2021,21 @@ static void khugepaged_scan_file(struct mm_struct *mm,
rcu_read_unlock();
if (result == SCAN_SUCCEED) {
- if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ if (cc->is_khugepaged &&
+ present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
- node = khugepaged_find_target_node();
- collapse_file(mm, file, start, hpage, node);
+ result = collapse_file(mm, file, start, cc);
}
}
/* TODO: tracepoints */
+ return result;
}
#else
-static void khugepaged_scan_file(struct mm_struct *mm,
- struct file *file, pgoff_t start, struct page **hpage)
+static int khugepaged_scan_file(struct mm_struct *mm, struct file *file,
+ pgoff_t start, struct collapse_control *cc)
{
BUILD_BUG();
}
@@ -2053,8 +2045,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
}
#endif
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
- struct page **hpage)
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
+ struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
@@ -2065,6 +2057,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
+ *result = SCAN_FAIL;
if (khugepaged_scan.mm_slot)
mm_slot = khugepaged_scan.mm_slot;
@@ -2085,7 +2078,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
vma = NULL;
if (unlikely(!mmap_read_trylock(mm)))
goto breakouterloop_mmap_lock;
- if (likely(!khugepaged_test_exit(mm)))
+ if (likely(!hpage_collapse_test_exit(mm)))
vma = find_vma(mm, khugepaged_scan.address);
progress++;
@@ -2093,11 +2086,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
unsigned long hstart, hend;
cond_resched();
- if (unlikely(khugepaged_test_exit(mm))) {
+ if (unlikely(hpage_collapse_test_exit(mm))) {
progress++;
break;
}
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
skip:
progress++;
continue;
@@ -2111,9 +2104,10 @@ skip:
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
while (khugepaged_scan.address < hend) {
- int ret;
+ bool mmap_locked = true;
+
cond_resched();
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2125,19 +2119,29 @@ skip:
khugepaged_scan.address);
mmap_read_unlock(mm);
- ret = 1;
- khugepaged_scan_file(mm, file, pgoff, hpage);
+ *result = khugepaged_scan_file(mm, file, pgoff,
+ cc);
+ mmap_locked = false;
fput(file);
} else {
- ret = khugepaged_scan_pmd(mm, vma,
- khugepaged_scan.address,
- hpage);
+ *result = hpage_collapse_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ &mmap_locked,
+ cc);
}
+ if (*result == SCAN_SUCCEED)
+ ++khugepaged_pages_collapsed;
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
- if (ret)
- /* we released mmap_lock so break loop */
+ if (!mmap_locked)
+ /*
+ * We released mmap_lock so break loop. Note
+ * that we drop mmap_lock before all hugepage
+ * allocations, so if allocation fails, we are
+ * guaranteed to break here and report the
+ * correct result back to caller.
+ */
goto breakouterloop_mmap_lock;
if (progress >= pages)
goto breakouterloop;
@@ -2153,7 +2157,7 @@ breakouterloop_mmap_lock:
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
*/
- if (khugepaged_test_exit(mm) || !vma) {
+ if (hpage_collapse_test_exit(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
@@ -2187,19 +2191,16 @@ static int khugepaged_wait_event(void)
kthread_should_stop();
}
-static void khugepaged_do_scan(void)
+static void khugepaged_do_scan(struct collapse_control *cc)
{
- struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
bool wait = true;
+ int result = SCAN_SUCCEED;
lru_add_drain_all();
- while (progress < pages) {
- if (!khugepaged_prealloc_page(&hpage, &wait))
- break;
-
+ while (true) {
cond_resched();
if (unlikely(kthread_should_stop() || try_to_freeze()))
@@ -2211,14 +2212,25 @@ static void khugepaged_do_scan(void)
if (khugepaged_has_work() &&
pass_through_head < 2)
progress += khugepaged_scan_mm_slot(pages - progress,
- &hpage);
+ &result, cc);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
- }
- if (!IS_ERR_OR_NULL(hpage))
- put_page(hpage);
+ if (progress >= pages)
+ break;
+
+ if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
+ /*
+ * If allocation fails the first time, sleep for a while;
+ * if it fails again, cancel the scan.
+ */
+ if (!wait)
+ break;
+ wait = false;
+ khugepaged_alloc_sleep();
+ }
+ }
}
static bool khugepaged_should_wakeup(void)
@@ -2255,7 +2267,7 @@ static int khugepaged(void *none)
set_user_nice(current, MAX_NICE);
while (!kthread_should_stop()) {
- khugepaged_do_scan();
+ khugepaged_do_scan(&khugepaged_collapse_control);
khugepaged_wait_work();
}
@@ -2354,3 +2366,120 @@ void khugepaged_min_free_kbytes_update(void)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
+
+static int madvise_collapse_errno(enum scan_result r)
+{
+ /*
+ * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
+ * actionable feedback to caller, so they may take an appropriate
+ * fallback measure depending on the nature of the failure.
+ */
+ switch (r) {
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ return -ENOMEM;
+ case SCAN_CGROUP_CHARGE_FAIL:
+ return -EBUSY;
+ /* Resource temporarily unavailable - trying again might succeed */
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_LRU:
+ return -EAGAIN;
+ /*
+ * Other: trying again is unlikely to succeed, or the error is
+ * intrinsic to the specified memory range. khugepaged likely
+ * won't be able to collapse it either.
+ */
+ default:
+ return -EINVAL;
+ }
+}
+
+int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ struct collapse_control *cc;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long hstart, hend, addr;
+ int thps = 0, last_fail = SCAN_FAIL;
+ bool mmap_locked = true;
+
+ BUG_ON(vma->vm_start > start);
+ BUG_ON(vma->vm_end < end);
+
+ *prev = vma;
+
+ /* TODO: Support file/shmem */
+ if (!vma->anon_vma || !vma_is_anonymous(vma))
+ return -EINVAL;
+
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ return -EINVAL;
+
+ cc = kmalloc(sizeof(*cc), GFP_KERNEL);
+ if (!cc)
+ return -ENOMEM;
+ cc->is_khugepaged = false;
+ cc->last_target_node = NUMA_NO_NODE;
+
+ mmgrab(mm);
+ lru_add_drain_all();
+
+ hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = end & HPAGE_PMD_MASK;
+
+ for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
+ int result = SCAN_FAIL;
+
+ if (!mmap_locked) {
+ cond_resched();
+ mmap_read_lock(mm);
+ mmap_locked = true;
+ result = hugepage_vma_revalidate(mm, addr, &vma, cc);
+ if (result != SCAN_SUCCEED) {
+ last_fail = result;
+ goto out_nolock;
+ }
+ }
+ mmap_assert_locked(mm);
+ memset(cc->node_load, 0, sizeof(cc->node_load));
+ result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked,
+ cc);
+ if (!mmap_locked)
+ *prev = NULL; /* Tell caller we dropped mmap_lock */
+
+ switch (result) {
+ case SCAN_SUCCEED:
+ case SCAN_PMD_MAPPED:
+ ++thps;
+ break;
+ /* Whitelisted set of results where continuing is OK */
+ case SCAN_PMD_NULL:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_PAGE_RO:
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_PAGE_NULL:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COMPOUND:
+ case SCAN_PAGE_LRU:
+ last_fail = result;
+ break;
+ default:
+ last_fail = result;
+ /* Other error, exit */
+ goto out_maybelock;
+ }
+ }
+
+out_maybelock:
+ /* Caller expects us to hold mmap_lock on return */
+ if (!mmap_locked)
+ mmap_read_lock(mm);
+out_nolock:
+ mmap_assert_locked(mm);
+ mmdrop(mm);
+ kfree(cc);
+
+ return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
+ : madvise_collapse_errno(last_fail);
+}
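An illustrative sketch (not part of the patch) of the range arithmetic madvise_collapse() uses above, assuming 2 MiB PMDs (HPAGE_PMD_SHIFT == 21): hstart rounds the start of the advised range up to a PMD boundary, hend rounds the end down, and the call reports success only when every PMD-sized step in between ends up PMD-mapped.

/* Illustrative only -- not part of the patch above. Assumes 2 MiB PMDs. */
#include <stdio.h>

#define HPAGE_PMD_SHIFT	21UL
#define HPAGE_PMD_SIZE	(1UL << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long start = 0x7f0000201000UL;	/* arbitrary, unaligned */
	unsigned long end   = 0x7f0000a00000UL;

	/* Same rounding as madvise_collapse(): up for hstart, down for hend. */
	unsigned long hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend   = end & HPAGE_PMD_MASK;
	unsigned long nr_pmds = (hend - hstart) >> HPAGE_PMD_SHIFT;

	printf("hstart=%#lx hend=%#lx -> %lu PMD-sized range(s) to collapse\n",
	       hstart, hend, nr_pmds);
	return 0;
}

A request that does not span at least one full PMD-aligned, PMD-sized region has hstart >= hend, so the loop never runs and the call trivially returns 0.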
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 1eddc0132f7f..37af2dc8dac9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -604,9 +604,8 @@ static int __save_stack_trace(unsigned long *trace)
* memory block and add it to the object_list and object_tree_root (or
* object_phys_tree_root).
*/
-static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp,
- bool is_phys)
+static void __create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp, bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object, *parent;
@@ -618,7 +617,7 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
- return NULL;
+ return;
}
INIT_LIST_HEAD(&object->object_list);
@@ -687,7 +686,6 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
*/
dump_object_info(parent);
kmem_cache_free(object_cache, object);
- object = NULL;
goto out;
}
}
@@ -698,21 +696,20 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
list_add_tail_rcu(&object->object_list, &object_list);
out:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- return object;
}
/* Create kmemleak object which allocated with virtual address. */
-static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static void create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
{
- return __create_object(ptr, size, min_count, gfp, false);
+ __create_object(ptr, size, min_count, gfp, false);
}
/* Create kmemleak object which allocated with physical address. */
-static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static void create_object_phys(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
{
- return __create_object(ptr, size, min_count, gfp, true);
+ __create_object(ptr, size, min_count, gfp, true);
}
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 42ab153335a2..fd6d03cb0463 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1095,6 +1095,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
}
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
@@ -1134,6 +1135,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
{
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
+ pmd_t pmde;
pte_t *ptep;
pte_t newpte;
spinlock_t *ptl;
@@ -1148,6 +1150,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
+ /*
+ * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
+ * without holding anon_vma lock for write. So when looking for a
+ * genuine pmde (in which to find pte), test present and !THP together.
+ */
+ pmde = *pmd;
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ goto out;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
diff --git a/mm/madvise.c b/mm/madvise.c
index 9ff51650f4f0..4f86eb7f554d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_FREE:
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -1060,6 +1061,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
if (error)
goto out;
break;
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1153,6 +1156,7 @@ madvise_behavior_valid(int behavior)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
+ case MADV_COLLAPSE:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
@@ -1169,13 +1173,13 @@ madvise_behavior_valid(int behavior)
}
}
-static bool
-process_madvise_behavior_valid(int behavior)
+static bool process_madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_WILLNEED:
+ case MADV_COLLAPSE:
return true;
default:
return false;
@@ -1342,6 +1346,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
* transparent huge pages so the existing pages will not be
* coalesced into THP and new pages will not be allocated as THP.
+ * MADV_COLLAPSE - synchronously coalesce pages into new THP.
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
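A minimal userspace sketch (not part of the patch) of invoking the new advice. The fallback MADV_COLLAPSE value of 25 mirrors the uapi addition in this series and only exists so the sketch builds against older headers; the allocation size and alignment are arbitrary.

/* Illustrative only. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* value added by this series' uapi change */
#endif

int main(void)
{
	size_t len = 4UL << 20;			/* two 2 MiB PMD ranges */
	char *p = aligned_alloc(2UL << 20, len);

	if (!p)
		return 1;
	memset(p, 0x5a, len);			/* fault the range in */

	/*
	 * Synchronous collapse: 0 means every hugepage-aligned/-sized
	 * region backing [p, p + len) is now PMD-mapped.
	 */
	if (madvise(p, len, MADV_COLLAPSE))
		fprintf(stderr, "MADV_COLLAPSE: %s\n", strerror(errno));
	else
		puts("range is now backed by PMD-mapped THPs");

	free(p);
	return 0;
}

On failure the errno comes from madvise_collapse_errno() above, e.g. EAGAIN for transient page-lock/LRU contention, ENOMEM when huge page allocation fails, and EBUSY when the cgroup charge fails.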
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..403af5f7a2b9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1143,7 +1143,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
} while ((memcg = parent_mem_cgroup(memcg)));
/*
- * When cgruop1 non-hierarchy mode is used,
+ * When cgroup1 non-hierarchy mode is used,
* parent_mem_cgroup() does not walk all the way up to the
* cgroup root (root_mem_cgroup). So we have to handle
* dead_memcg from cgroup root separately.
@@ -3975,6 +3975,8 @@ static const unsigned int memcg1_stats[] = {
NR_FILE_MAPPED,
NR_FILE_DIRTY,
NR_WRITEBACK,
+ WORKINGSET_REFAULT_ANON,
+ WORKINGSET_REFAULT_FILE,
MEMCG_SWAP,
};
@@ -3988,6 +3990,8 @@ static const char *const memcg1_stat_names[] = {
"mapped_file",
"dirty",
"writeback",
+ "workingset_refault_anon",
+ "workingset_refault_file",
"swap",
};
@@ -4016,7 +4020,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+ nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -4047,7 +4052,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)nr * PAGE_SIZE);
+ (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e7ac570dda75..265378237c22 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -413,7 +413,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
{
struct to_kill *tk, *next;
- list_for_each_entry_safe (tk, next, to_kill, nd) {
+ list_for_each_entry_safe(tk, next, to_kill, nd) {
if (forcekill) {
/*
* In case something went wrong with munmapping
@@ -437,6 +437,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
+ list_del(&tk->nd);
put_task_struct(tk->tsk);
kfree(tk);
}
@@ -1401,7 +1402,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
- int kill = 1, forcekill;
+ int forcekill;
bool mlocked = PageMlocked(hpage);
/*
@@ -1442,7 +1443,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
- kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
@@ -1453,12 +1453,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
- *
- * Error handling: We ignore errors here because
- * there's nothing that can be done.
*/
- if (kill)
- collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
if (PageHuge(hpage) && !PageAnon(hpage)) {
/*
@@ -1500,7 +1496,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
+ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
+ !unmap_success;
kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
return unmap_success;
@@ -1529,20 +1526,18 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
-static int try_to_split_thp_page(struct page *page, const char *msg)
+static int try_to_split_thp_page(struct page *page)
{
+ int ret;
+
lock_page(page);
- if (unlikely(split_huge_page(page))) {
- unsigned long pfn = page_to_pfn(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
- unlock_page(page);
- pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ if (unlikely(ret))
put_page(page);
- return -EBUSY;
- }
- unlock_page(page);
- return 0;
+ return ret;
}
static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
@@ -1867,8 +1862,10 @@ retry:
if (hwpoison_filter(p)) {
hugetlb_clear_page_hwpoison(head);
- res = -EOPNOTSUPP;
- goto out;
+ unlock_page(head);
+ if (res == 1)
+ put_page(head);
+ return -EOPNOTSUPP;
}
/*
@@ -2084,7 +2081,7 @@ try_again:
* page is a valid handlable page.
*/
SetPageHasHWPoisoned(hpage);
- if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ if (try_to_split_thp_page(p) < 0) {
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
res = -EBUSY;
goto unlock_mutex;
@@ -2359,7 +2356,7 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (PageSlab(page) || PageTable(page))
+ if (PageSlab(page) || PageTable(page) || PageReserved(page))
goto unlock_mutex;
ret = get_hwpoison_page(p, MF_UNPOISON);
@@ -2383,13 +2380,14 @@ int unpoison_memory(unsigned long pfn)
count = free_raw_hwp_pages(page, false);
if (count == 0) {
ret = -EBUSY;
+ put_page(page);
goto unlock_mutex;
}
}
freeit = !!TestClearPageHWPoison(p);
put_page(page);
- if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
+ if (freeit) {
put_page(page);
ret = 0;
}
@@ -2439,11 +2437,11 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
}
/*
- * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
* If the page is a non-dirty unmapped page-cache page, it simply invalidates.
* If the page is mapped, it migrates the contents over.
*/
-static int __soft_offline_page(struct page *page)
+static int soft_offline_in_use_page(struct page *page)
{
long ret = 0;
unsigned long pfn = page_to_pfn(page);
@@ -2456,6 +2454,14 @@ static int __soft_offline_page(struct page *page)
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
+ if (!huge && PageTransHuge(hpage)) {
+ if (try_to_split_thp_page(page)) {
+ pr_info("soft offline: %#lx: thp split failed\n", pfn);
+ return -EBUSY;
+ }
+ hpage = page;
+ }
+
lock_page(page);
if (!PageHuge(page))
wait_on_page_writeback(page);
@@ -2505,26 +2511,6 @@ static int __soft_offline_page(struct page *page)
return ret;
}
-static int soft_offline_in_use_page(struct page *page)
-{
- struct page *hpage = compound_head(page);
-
- if (!PageHuge(page) && PageTransHuge(hpage))
- if (try_to_split_thp_page(page, "soft offline") < 0)
- return -EBUSY;
- return __soft_offline_page(page);
-}
-
-static int soft_offline_free_page(struct page *page)
-{
- int rc = 0;
-
- if (!page_handle_poison(page, true, false))
- rc = -EBUSY;
-
- return rc;
-}
-
static void put_ref_page(struct page *page)
{
if (page)
@@ -2592,8 +2578,6 @@ retry:
if (hwpoison_filter(page)) {
if (ret > 0)
put_page(page);
- else
- put_ref_page(ref_page);
mutex_unlock(&mf_mutex);
return -EOPNOTSUPP;
@@ -2602,7 +2586,7 @@ retry:
if (ret > 0) {
ret = soft_offline_in_use_page(page);
} else if (ret == 0) {
- if (soft_offline_free_page(page) && try_again) {
+ if (!page_handle_poison(page, true, false) && try_again) {
try_again = false;
flags &= ~MF_COUNT_INCREASED;
goto retry;
diff --git a/mm/memory.c b/mm/memory.c
index a78814413ac0..e38f9245470c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -74,6 +74,7 @@
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
+#include <linux/sched/sysctl.h>
#include <trace/events/kmem.h>
@@ -4731,8 +4732,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
- last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
+ /*
+ * In memory tiering mode, the cpupid of a slow memory page is
+ * used to record page access time, so use the default value here.
+ */
+ if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(page_nid))
+ last_cpupid = (-1 & LAST_CPUPID_MASK);
+ else
+ last_cpupid = page_cpupid_last(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
if (target_nid == NUMA_NO_NODE) {
@@ -4991,7 +5000,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- hugepage_vma_check(vma, vm_flags, false, true)) {
+ hugepage_vma_check(vma, vm_flags, false, true, true)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5025,7 +5034,7 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- hugepage_vma_check(vma, vm_flags, false, true)) {
+ hugepage_vma_check(vma, vm_flags, false, true, true)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fad6d1f2262a..9ae1f98548b1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1940,8 +1940,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
node_states_clear_node(node, &arg);
if (arg.status_change_nid >= 0) {
- kswapd_stop(node);
kcompactd_stop(node);
+ kswapd_stop(node);
}
writeback_set_ratelimit();
@@ -1969,11 +1969,10 @@ failed_removal:
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
- int ret = !is_memblock_offlined(mem);
int *nid = arg;
*nid = mem->nid;
- if (unlikely(ret)) {
+ if (unlikely(mem->state != MEM_OFFLINE)) {
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b73d3248d976..a88fd94e18d6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -853,12 +853,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
+ task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
+ task_unlock(current);
mpol_put(new);
goto out;
}
- task_lock(current);
+
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -1803,7 +1805,7 @@ bool vma_policy_mof(struct vm_area_struct *vma)
return pol->flags & MPOL_F_MOF;
}
-static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
diff --git a/mm/memremap.c b/mm/memremap.c
index 58b20c3c300b..25029a474d30 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -454,7 +454,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
/* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
- if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
+ if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a1597c92261..ce6a58f3b21f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -560,6 +560,18 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
* future migrations of this same page.
*/
cpupid = page_cpupid_xchg_last(&folio->page, -1);
+ /*
+ * In memory tiering mode, reset cpupid when migrating between
+ * slow and fast memory nodes, because it is used to record page
+ * access time on slow memory nodes.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
+ bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
+ bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));
+
+ if (f_toptier != t_toptier)
+ cpupid = -1;
+ }
page_cpupid_xchg_last(&newfolio->page, cpupid);
folio_migrate_ksm(newfolio, folio);
@@ -1848,6 +1860,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
for (i = 0; i < nr_pages; i++) {
unsigned long addr = (unsigned long)(*pages);
+ unsigned int foll_flags = FOLL_DUMP;
struct vm_area_struct *vma;
struct page *page;
int err = -EFAULT;
@@ -1856,8 +1869,12 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
if (!vma)
goto set_status;
+ /* Not all huge page follow APIs support 'FOLL_GET' */
+ if (!is_vm_hugetlb_page(vma))
+ foll_flags |= FOLL_GET;
+
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+ page = follow_page(vma, addr, foll_flags);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1865,7 +1882,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
if (page && !is_zone_device_page(page)) {
err = page_to_nid(page);
- put_page(page);
+ if (foll_flags & FOLL_GET)
+ put_page(page);
} else {
err = -ENOENT;
}
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index dbf6c7a7a7c9..d8efd5a0eb40 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -186,9 +186,16 @@ again:
get_page(page);
/*
- * Optimize for the common case where page is only mapped once
- * in one process. If we can lock the page, then we can safely
- * set up a special migration page table entry now.
+ * We rely on trylock_page() to avoid deadlock between
+ * concurrent migrations where each is waiting on the other's
+ * page lock. If we can't immediately lock the page we fail this
+ * migration as it is only best effort anyway.
+ *
+ * If we can lock the page it's safe to set up a migration entry
+ * now. In the common case where the page is mapped once in a
+ * single process setting up the migration entry now is an
+ * optimisation to avoid walking the rmap later with
+ * try_to_migrate().
*/
if (trylock_page(page)) {
bool anon_exclusive;
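The trylock comment above is the heart of the change: a migration that cannot take the page lock immediately skips the page instead of waiting on a lock whose holder may in turn be waiting on one of ours. A rough userspace analogy of that best-effort pattern (illustrative only, POSIX mutexes standing in for page locks):

/* Illustrative analogy only -- not kernel code. Build with -lpthread. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

static void *migrate_one(void *arg)
{
	pthread_mutex_t *mine   = arg ? &b : &a;
	pthread_mutex_t *theirs = arg ? &a : &b;

	pthread_mutex_lock(mine);
	if (pthread_mutex_trylock(theirs) == 0) {
		puts("got both locks: set up the migration entry");
		pthread_mutex_unlock(theirs);
	} else {
		/* Best effort: give up rather than risk an ABBA deadlock. */
		puts("contended: skip this page");
	}
	pthread_mutex_unlock(mine);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, migrate_one, NULL);
	pthread_create(&t2, NULL, migrate_one, (void *)1);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}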
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d780f415be3..dd25a2aa94f7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2232,6 +2232,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ /* Ensures that larger anonymous mappings are THP aligned. */
+ get_area = thp_get_unmapped_area;
}
addr = get_area(file, addr, len, pgoff, flags);
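With this hunk, large anonymous mappings on CONFIG_TRANSPARENT_HUGEPAGE kernels are placed by thp_get_unmapped_area() and should come back PMD-aligned. A quick userspace check (illustrative only; assumes 2 MiB PMD-sized THPs):

/* Illustrative only. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 8UL << 20;		/* large enough to be THP-eligible */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	printf("mapping at %p, 2 MiB aligned: %s\n", p,
	       ((unsigned long)p & ((2UL << 20) - 1)) ? "no" : "yes");
	munmap(p, len);
	return 0;
}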
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bc6bddd156ca..ed013f836b4a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -121,6 +121,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
if (prot_numa) {
struct page *page;
int nid;
+ bool toptier;
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
@@ -150,14 +151,19 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
nid = page_to_nid(page);
if (target_node == nid)
continue;
+ toptier = node_is_toptier(nid);
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- node_is_toptier(nid))
+ toptier)
continue;
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page,
+ jiffies_to_msecs(jiffies));
}
oldpte = ptep_modify_prot_start(vma, addr, pte);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d04211f0ef0b..262896bd1a90 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -482,6 +482,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
static unsigned long prev_end_pfn, nr_initialised;
+ if (early_page_ext_enabled())
+ return false;
/*
* prev_end_pfn static that contains the end of previous zone
* No need to protect because called very early in boot before smp_init.
@@ -3010,7 +3012,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
- if (alloc_flags & ALLOC_NOFRAGMENT)
+ if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
@@ -3598,16 +3600,11 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
- unsigned long watermark;
- struct zone *zone;
- int mt;
-
- BUG_ON(!PageBuddy(page));
-
- zone = page_zone(page);
- mt = get_pageblock_migratetype(page);
+ struct zone *zone = page_zone(page);
+ int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
+ unsigned long watermark;
/*
* Obey watermarks as if the page was being allocated. We can
* emulate a high-order watermark check with a raised order-0
@@ -3621,8 +3618,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
__mod_zone_freepage_state(zone, -(1UL << order), mt);
}
- /* Remove page from free list */
-
del_page_from_free_list(page, zone, order);
/*
@@ -3643,7 +3638,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
}
-
return 1UL << order;
}
@@ -3777,8 +3771,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype,
- unsigned int alloc_flags)
+ int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -3839,7 +3832,7 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
migratetype != MIGRATE_MOVABLE) {
page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype, alloc_flags);
+ migratetype, alloc_flags);
if (likely(page))
goto out;
}
@@ -7659,6 +7652,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
int i;
pgdat_resize_init(pgdat);
+ pgdat_kswapd_lock_init(pgdat);
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
@@ -7953,17 +7947,6 @@ unsigned long __init node_map_pfn_alignment(void)
return ~accl_mask + 1;
}
-/**
- * find_min_pfn_with_active_regions - Find the minimum PFN registered
- *
- * Return: the minimum PFN based on information provided via
- * memblock_set_node().
- */
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
- return PHYS_PFN(memblock_start_of_DRAM());
-}
-
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
@@ -8256,7 +8239,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
- start_pfn = find_min_pfn_with_active_regions();
+ start_pfn = PHYS_PFN(memblock_start_of_DRAM());
descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -9019,7 +9002,7 @@ void *__init alloc_large_system_hash(const char *tablename,
{
unsigned long long max = high_limit;
unsigned long log2qty, size;
- void *table = NULL;
+ void *table;
gfp_t gfp_flags;
bool virt;
bool huge;
diff --git a/mm/page_counter.c b/mm/page_counter.c
index eb156ff5d603..db20d6452b71 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -17,24 +17,23 @@ static void propagate_protected_usage(struct page_counter *c,
unsigned long usage)
{
unsigned long protected, old_protected;
- unsigned long low, min;
long delta;
if (!c->parent)
return;
- min = READ_ONCE(c->min);
- if (min || atomic_long_read(&c->min_usage)) {
- protected = min(usage, min);
+ protected = min(usage, READ_ONCE(c->min));
+ old_protected = atomic_long_read(&c->min_usage);
+ if (protected != old_protected) {
old_protected = atomic_long_xchg(&c->min_usage, protected);
delta = protected - old_protected;
if (delta)
atomic_long_add(delta, &c->parent->children_min_usage);
}
- low = READ_ONCE(c->low);
- if (low || atomic_long_read(&c->low_usage)) {
- protected = min(usage, low);
+ protected = min(usage, READ_ONCE(c->low));
+ old_protected = atomic_long_read(&c->low_usage);
+ if (protected != old_protected) {
old_protected = atomic_long_xchg(&c->low_usage, protected);
delta = protected - old_protected;
if (delta)
@@ -193,7 +192,7 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
old = xchg(&counter->max, nr_pages);
- if (page_counter_read(counter) <= usage)
+ if (page_counter_read(counter) <= usage || nr_pages >= old)
return 0;
counter->max = old;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 3dc715d7ac29..affe80243b6d 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -9,6 +9,7 @@
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
+#include <linux/rcupdate.h>
/*
* struct page extension
@@ -59,6 +60,10 @@
* can utilize this callback to initialize the state of it correctly.
*/
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID (0x1)
+#endif
+
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
static bool need_page_idle(void)
{
@@ -84,6 +89,15 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
unsigned long page_ext_size = sizeof(struct page_ext);
static unsigned long total_usage;
+static struct page_ext *lookup_page_ext(const struct page *page);
+
+bool early_page_ext;
+static int __init setup_early_page_ext(char *str)
+{
+ early_page_ext = true;
+ return 0;
+}
+early_param("early_page_ext", setup_early_page_ext);
static bool __init invoke_need_callbacks(void)
{
@@ -125,6 +139,48 @@ static inline struct page_ext *get_entry(void *base, unsigned long index)
return base + page_ext_size * index;
}
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context. Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ rcu_read_lock();
+ page_ext = lookup_page_ext(page);
+ if (!page_ext) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context in which the corresponding page_ext_get() was called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+ if (unlikely(!page_ext))
+ return;
+
+ rcu_read_unlock();
+}
#ifndef CONFIG_SPARSEMEM
@@ -133,12 +189,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
pgdat->node_page_ext = NULL;
}
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long index;
struct page_ext *base;
+ WARN_ON_ONCE(!rcu_read_lock_held());
base = NODE_DATA(page_to_nid(page))->node_page_ext;
/*
* The sanity checks the page allocator does upon freeing a
@@ -206,20 +263,27 @@ fail:
}
#else /* CONFIG_SPARSEMEM */
+static bool page_ext_invalid(struct page_ext *page_ext)
+{
+ return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
+}
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
+ struct page_ext *page_ext = READ_ONCE(section->page_ext);
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
*/
- if (!section->page_ext)
+ if (page_ext_invalid(page_ext))
return NULL;
- return get_entry(section->page_ext, pfn);
+ return get_entry(page_ext, pfn);
}
static void *__meminit alloc_page_ext(size_t size, int nid)
@@ -298,9 +362,30 @@ static void __free_page_ext(unsigned long pfn)
ms = __pfn_to_section(pfn);
if (!ms || !ms->page_ext)
return;
- base = get_entry(ms->page_ext, pfn);
+
+ base = READ_ONCE(ms->page_ext);
+ /*
+ * page_ext here can be valid while doing the roll back
+ * operation in online_page_ext().
+ */
+ if (page_ext_invalid(base))
+ base = (void *)base - PAGE_EXT_INVALID;
+ WRITE_ONCE(ms->page_ext, NULL);
+
+ base = get_entry(base, pfn);
free_page_ext(base);
- ms->page_ext = NULL;
+}
+
+static void __invalidate_page_ext(unsigned long pfn)
+{
+ struct mem_section *ms;
+ void *val;
+
+ ms = __pfn_to_section(pfn);
+ if (!ms || !ms->page_ext)
+ return;
+ val = (void *)ms->page_ext + PAGE_EXT_INVALID;
+ WRITE_ONCE(ms->page_ext, val);
}
static int __meminit online_page_ext(unsigned long start_pfn,
@@ -336,13 +421,27 @@ static int __meminit online_page_ext(unsigned long start_pfn,
}
static int __meminit offline_page_ext(unsigned long start_pfn,
- unsigned long nr_pages, int nid)
+ unsigned long nr_pages)
{
unsigned long start, end, pfn;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+ /*
+ * Freeing of page_ext is done in 3 steps to avoid
+ * use-after-free of it:
+ * 1) Traverse all the sections and mark their page_ext
+ * as invalid.
+ * 2) Wait for all the existing users of page_ext who
+ * started before invalidation to finish.
+ * 3) Free the page_ext.
+ */
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __invalidate_page_ext(pfn);
+
+ synchronize_rcu();
+
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_ext(pfn);
return 0;
@@ -362,11 +461,11 @@ static int __meminit page_ext_callback(struct notifier_block *self,
break;
case MEM_OFFLINE:
offline_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ mn->nr_pages);
break;
case MEM_CANCEL_ONLINE:
offline_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ mn->nr_pages);
break;
case MEM_GOING_OFFLINE:
break;
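The get/put pairing documented above is the pattern every page_ext user follows in the later hunks of this series (page_owner, page_table_check). A kernel-context sketch of the expected caller shape -- not standalone-buildable, and page_has_owner_flag() is a made-up name for illustration:

/* Kernel-context sketch only; page_has_owner_flag() is hypothetical. */
static bool page_has_owner_flag(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	bool ret;

	if (!page_ext)			/* no page_ext for this page */
		return false;

	/* The RCU read side is held from here on: no sleeping allowed. */
	ret = test_bit(PAGE_EXT_OWNER, &page_ext->flags);

	page_ext_put(page_ext);		/* ends the RCU read-side section */
	return ret;
}

Anything that must outlive the read side has to be copied out first, which is exactly what read_page_owner() below does with its temporary page_owner before calling page_ext_put().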
diff --git a/mm/page_io.c b/mm/page_io.c
index 68318134dc92..68d53fc27598 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -28,7 +28,7 @@
#include <linux/delayacct.h>
#include "swap.h"
-void end_swap_bio_write(struct bio *bio)
+static void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -202,7 +202,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
goto out;
}
- ret = __swap_writepage(page, wbc, end_swap_bio_write);
+ ret = __swap_writepage(page, wbc);
out:
return ret;
}
@@ -332,8 +332,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
return 0;
}
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func)
+int __swap_writepage(struct page *page, struct writeback_control *wbc)
{
struct bio *bio;
int ret;
@@ -358,7 +357,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
GFP_NOIO);
bio->bi_iter.bi_sector = swap_page_sector(page);
- bio->bi_end_io = end_write_func;
+ bio->bi_end_io = end_swap_bio_write;
bio_add_page(bio, page, thp_size(page), 0);
bio_associate_blkg_from_page(bio, page);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index e4c6f3f1695b..90023f938c19 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -141,7 +141,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
struct page_owner *page_owner;
u64 free_ts_nsec = local_clock();
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
@@ -153,6 +153,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_owner->free_ts_nsec = free_ts_nsec;
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
static inline void __set_page_owner_handle(struct page_ext *page_ext,
@@ -183,19 +184,21 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext;
depot_stack_handle_t handle;
+ handle = save_stack(gfp_mask);
+
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
-
- handle = save_stack(gfp_mask);
__set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ page_ext_put(page_ext);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get(page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
@@ -203,12 +206,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner = get_page_owner(page_ext);
page_owner->last_migrate_reason = reason;
+ page_ext_put(page_ext);
}
void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get(page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
@@ -219,16 +223,24 @@ void __split_page_owner(struct page *page, unsigned int nr)
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
- struct page_ext *old_ext = lookup_page_ext(&old->page);
- struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
+ struct page_ext *old_ext;
+ struct page_ext *new_ext;
struct page_owner *old_page_owner, *new_page_owner;
- if (unlikely(!old_ext || !new_ext))
+ old_ext = page_ext_get(&old->page);
+ if (unlikely(!old_ext))
+ return;
+
+ new_ext = page_ext_get(&newfolio->page);
+ if (unlikely(!new_ext)) {
+ page_ext_put(old_ext);
return;
+ }
old_page_owner = get_page_owner(old_ext);
new_page_owner = get_page_owner(new_ext);
@@ -254,6 +266,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+ page_ext_put(new_ext);
+ page_ext_put(old_ext);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -307,12 +321,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageReserved(page))
continue;
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
- continue;
+ goto ext_put_continue;
page_owner = get_page_owner(page_ext);
page_mt = gfp_migratetype(page_owner->gfp_mask);
@@ -323,9 +337,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
count[pageblock_mt]++;
pfn = block_end_pfn;
+ page_ext_put(page_ext);
break;
}
pfn += (1UL << page_owner->order) - 1;
+ext_put_continue:
+ page_ext_put(page_ext);
}
}
@@ -435,7 +452,7 @@ err:
void __dump_page_owner(const struct page *page)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get((void *)page);
struct page_owner *page_owner;
depot_stack_handle_t handle;
gfp_t gfp_mask;
@@ -452,6 +469,7 @@ void __dump_page_owner(const struct page *page)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
pr_alert("page_owner info is not present (never set?)\n");
+ page_ext_put(page_ext);
return;
}
@@ -482,6 +500,7 @@ void __dump_page_owner(const struct page *page)
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_owner->last_migrate_reason]);
+ page_ext_put(page_ext);
}
static ssize_t
@@ -497,8 +516,10 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return -EINVAL;
page = NULL;
- pfn = min_low_pfn + *ppos;
-
+ if (*ppos == 0)
+ pfn = min_low_pfn;
+ else
+ pfn = *ppos;
/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
pfn++;
@@ -508,6 +529,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
/* Find an allocated page */
for (; pfn < max_pfn; pfn++) {
/*
+ * This temporary page_owner copy is required so that we can
+ * avoid context switches while holding the rcu lock: the copy
+ * to userspace via copy_to_user() and any GFP_KERNEL allocation
+ * happen only after page_ext_put().
+ */
+ struct page_owner page_owner_tmp;
+
+ /*
* If the new page is in a new MAX_ORDER_NR_PAGES area,
* validate the area as existing, skip it if not
*/
@@ -525,7 +554,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
continue;
}
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
@@ -534,14 +563,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* because we don't hold the zone lock.
*/
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
+ goto ext_put_continue;
/*
* Although we do have the info about past allocation of free
* pages, it's not relevant for current memory usage.
*/
if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
- continue;
+ goto ext_put_continue;
page_owner = get_page_owner(page_ext);
@@ -550,7 +579,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* would inflate the stats.
*/
if (!IS_ALIGNED(pfn, 1 << page_owner->order))
- continue;
+ goto ext_put_continue;
/*
* Access to page_ext->handle isn't synchronous so we should
@@ -558,18 +587,37 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
*/
handle = READ_ONCE(page_owner->handle);
if (!handle)
- continue;
+ goto ext_put_continue;
/* Record the next PFN to read in the file offset */
- *ppos = (pfn - min_low_pfn) + 1;
+ *ppos = pfn + 1;
+ page_owner_tmp = *page_owner;
+ page_ext_put(page_ext);
return print_page_owner(buf, count, pfn, page,
- page_owner, handle);
+ &page_owner_tmp, handle);
+ext_put_continue:
+ page_ext_put(page_ext);
}
return 0;
}
+static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
+{
+ switch (orig) {
+ case SEEK_SET:
+ file->f_pos = offset;
+ break;
+ case SEEK_CUR:
+ file->f_pos += offset;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return file->f_pos;
+}
+
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
unsigned long pfn = zone->zone_start_pfn;
@@ -617,18 +665,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
if (PageReserved(page))
continue;
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
/* Maybe overlapping zone */
if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
+ goto ext_put_continue;
/* Found early allocated page */
__set_page_owner_handle(page_ext, early_handle,
0, 0);
count++;
+ext_put_continue:
+ page_ext_put(page_ext);
}
cond_resched();
}
@@ -660,6 +710,7 @@ static void init_early_allocated_pages(void)
static const struct file_operations proc_page_owner_operations = {
.read = read_page_owner,
+ .llseek = lseek_page_owner,
};
static int __init pageowner_init(void)
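Because *ppos is now interpreted as a raw PFN and the file gained an .llseek handler, userspace can start a page_owner dump at an arbitrary PFN rather than always walking up from min_low_pfn. An illustrative reader (assumes debugfs mounted at /sys/kernel/debug and a kernel booted with page_owner=on):

/* Illustrative only. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned long pfn = argc > 1 ? strtoul(argv[1], NULL, 0) : 0;
	char buf[4096];
	ssize_t n;
	int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);

	if (fd < 0)
		return 1;
	/* Jump straight to the requested PFN; 0 keeps the old behaviour. */
	if (pfn && lseek(fd, pfn, SEEK_SET) < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}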
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index e2062748791a..903db62794d3 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -68,7 +68,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
return;
page = pfn_to_page(pfn);
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -83,6 +83,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
}
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
/*
@@ -103,7 +104,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
return;
page = pfn_to_page(pfn);
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -118,6 +119,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
}
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
/*
@@ -126,9 +128,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
*/
void __page_table_check_zero(struct page *page, unsigned int order)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext;
unsigned long i;
+ page_ext = page_ext_get(page);
BUG_ON(!page_ext);
for (i = 0; i < (1ul << order); i++) {
struct page_table_check *ptc = get_page_table_check(page_ext);
@@ -137,6 +140,7 @@ void __page_table_check_zero(struct page *page, unsigned int order)
BUG_ON(atomic_read(&ptc->file_map_count));
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 9b3db11a4d1d..908ec1577f40 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -479,7 +479,15 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
return err;
}
-/*
+/**
+ * walk_page_range_novma - walk a range of pagetables not backed by a vma
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @pgd: pgd to walk if different from mm->pgd
+ * @private: private data for callbacks' usage
+ *
* Similar to walk_page_range() but can walk any page tables even if they are
* not backed by VMAs. Because 'unusual' entries may be walked this function
* will also not lock the PTEs for the pte_entry() callback. This is useful for
diff --git a/mm/rmap.c b/mm/rmap.c
index edc06c52bc82..6781f693df50 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -767,13 +767,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return vma_address(page, vma);
}
+/*
+ * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
+ * NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
+ * represents.
+ */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd = NULL;
- pmd_t pmde;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -788,15 +792,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
goto out;
pmd = pmd_offset(pud, address);
- /*
- * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
- * without holding anon_vma lock for write. So when looking for a
- * genuine pmde (in which to find pte), test present and !THP together.
- */
- pmde = *pmd;
- barrier();
- if (!pmd_present(pmde) || pmd_trans_huge(pmde))
- pmd = NULL;
out:
return pmd;
}
@@ -1579,11 +1574,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- /*
- * Nuke the page table entry. When having to clear
- * PageAnonExclusive(), we always have to flush.
- */
- if (should_defer_flush(mm, flags) && !anon_exclusive) {
+ /* Nuke the page table entry. */
+ if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
@@ -1714,6 +1706,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
swap_free(entry);
@@ -2045,6 +2039,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
!anon_exclusive, subpage);
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
if (folio_test_hugetlb(folio))
diff --git a/mm/slub.c b/mm/slub.c
index 862dbd9af4f5..6953c3367bc2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5745,6 +5745,29 @@ STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif /* CONFIG_SLUB_STATS */
+#ifdef CONFIG_KFENCE
+static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
+}
+
+static ssize_t skip_kfence_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ int ret = length;
+
+ if (buf[0] == '0')
+ s->flags &= ~SLAB_SKIP_KFENCE;
+ else if (buf[0] == '1')
+ s->flags |= SLAB_SKIP_KFENCE;
+ else
+ ret = -EINVAL;
+
+ return ret;
+}
+SLAB_ATTR(skip_kfence);
+#endif
+
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
&object_size_attr.attr,
@@ -5812,6 +5835,9 @@ static struct attribute *slab_attrs[] = {
&failslab_attr.attr,
#endif
&usersize_attr.attr,
+#ifdef CONFIG_KFENCE
+ &skip_kfence_attr.attr,
+#endif
NULL
};
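The new skip_kfence attribute is a plain '0'/'1' toggle under each cache's sysfs directory. An illustrative way to flip it from userspace -- the cache name kmalloc-64 is only an example, and the file exists only on CONFIG_KFENCE SLUB kernels:

/* Illustrative only. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/slab/kmalloc-64/skip_kfence", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The store accepts '0' or '1'; anything else returns -EINVAL. */
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}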
diff --git a/mm/swap.h b/mm/swap.h
index 17936e068c1c..0ffa5b478051 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -18,9 +18,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
}
void swap_write_unplug(struct swap_iocb *sio);
int swap_writepage(struct page *page, struct writeback_control *wbc);
-void end_swap_bio_write(struct bio *bio);
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func);
+int __swap_writepage(struct page *page, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */
diff --git a/mm/util.c b/mm/util.c
index c9439c66d8cf..8d944ce71e94 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -850,10 +850,10 @@ int folio_mapcount(struct folio *folio)
return atomic_read(&folio->_mapcount) + 1;
compound = folio_entire_mapcount(folio);
- nr = folio_nr_pages(folio);
if (folio_test_hugetlb(folio))
return compound;
ret = compound;
+ nr = folio_nr_pages(folio);
for (i = 0; i < nr; i++)
ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
/* File pages has compound_mapcount included in _mapcount */
@@ -1052,6 +1052,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
+ pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
+ __func__, current->pid, current->comm);
vm_unacct_memory(pages);
return -ENOMEM;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index dd6cdb201195..a991b909866f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -590,7 +590,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
int err;
err = vmap_range_noflush(addr, addr + (1UL << page_shift),
- __pa(page_address(pages[i])), prot,
+ page_to_phys(pages[i]), prot,
page_shift);
if (err)
return err;
@@ -1300,12 +1300,12 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size,
#include <linux/random.h>
static struct vmap_area *
-find_vmap_lowest_linear_match(unsigned long size,
+find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
unsigned long align, unsigned long vstart)
{
struct vmap_area *va;
- list_for_each_entry(va, &free_vmap_area_list, list) {
+ list_for_each_entry(va, head, list) {
if (!is_within_this_va(va, size, align, vstart))
continue;
@@ -1316,7 +1316,8 @@ find_vmap_lowest_linear_match(unsigned long size,
}
static void
-find_vmap_lowest_match_check(unsigned long size, unsigned long align)
+find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
@@ -1325,8 +1326,8 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
- va_1 = find_vmap_lowest_match(size, align, vstart, false);
- va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+ va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
+ va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1513,7 +1514,7 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(size, align);
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 382dbe97329f..c879694b41fe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3228,22 +3228,22 @@ again:
if (!sc->force_deactivate) {
unsigned long refaults;
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
refaults = lruvec_page_state(target_lruvec,
WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
+ if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
sc->may_deactivate |= DEACTIVATE_ANON;
else
sc->may_deactivate &= ~DEACTIVATE_ANON;
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
refaults = lruvec_page_state(target_lruvec,
WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
+ if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
sc->may_deactivate |= DEACTIVATE_FILE;
else
@@ -3559,9 +3559,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
- target_lruvec->refaults[0] = refaults;
+ target_lruvec->refaults[WORKINGSET_ANON] = refaults;
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
- target_lruvec->refaults[1] = refaults;
+ target_lruvec->refaults[WORKINGSET_FILE] = refaults;
}
/*
@@ -4643,16 +4643,17 @@ void kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- if (pgdat->kswapd)
- return;
-
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
- if (IS_ERR(pgdat->kswapd)) {
- /* failure at boot is fatal */
- BUG_ON(system_state < SYSTEM_RUNNING);
- pr_err("Failed to start kswapd on node %d\n", nid);
- pgdat->kswapd = NULL;
+ pgdat_kswapd_lock(pgdat);
+ if (!pgdat->kswapd) {
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+ BUG_ON(system_state < SYSTEM_RUNNING);
+ pr_err("Failed to start kswapd on node %d\n", nid);
+ pgdat->kswapd = NULL;
+ }
}
+ pgdat_kswapd_unlock(pgdat);
}
/*
@@ -4661,12 +4662,16 @@ void kswapd_run(int nid)
*/
void kswapd_stop(int nid)
{
- struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct task_struct *kswapd;
+ pgdat_kswapd_lock(pgdat);
+ kswapd = pgdat->kswapd;
if (kswapd) {
kthread_stop(kswapd);
- NODE_DATA(nid)->kswapd = NULL;
+ pgdat->kswapd = NULL;
}
+ pgdat_kswapd_unlock(pgdat);
}
static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 90af9a8572f5..c109167a669c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1252,6 +1252,7 @@ const char * const vmstat_text[] = {
#endif
#ifdef CONFIG_NUMA_BALANCING
"pgpromote_success",
+ "pgpromote_candidate",
#endif
/* enum writeback_stat_item counters */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 907c9b1e1e61..12eb11e70939 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1555,6 +1555,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
d_off += size;
d_size -= size;
+ /*
+ * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic()
+ * calls must occur in reverse order of calls to kmap_atomic().
+ * So, to call kunmap_atomic(s_addr) we should first call
+ * kunmap_atomic(d_addr). For more details see
+ * Documentation/mm/highmem.rst.
+ */
if (s_off >= PAGE_SIZE) {
kunmap_atomic(d_addr);
kunmap_atomic(s_addr);
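The rule the new comment points at is simply that kmap_atomic() mappings must be released in LIFO order. A minimal illustration, using placeholder src_page/dst_page rather than the zs_object_copy() variables:

	void *src = kmap_atomic(src_page);	/* outer mapping */
	void *dst = kmap_atomic(dst_page);	/* inner mapping */

	memcpy(dst, src, PAGE_SIZE);

	kunmap_atomic(dst);	/* inner mapping is torn down first */
	kunmap_atomic(src);	/* then the outer one */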
@@ -2103,8 +2110,6 @@ unsigned long zs_compact(struct zs_pool *pool)
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
- if (!class)
- continue;
if (class->index != i)
continue;
pages_freed += __zs_compact(pool, class);
@@ -2149,8 +2154,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
- if (!class)
- continue;
if (class->index != i)
continue;
@@ -2308,9 +2311,6 @@ void zs_destroy_pool(struct zs_pool *pool)
int fg;
struct size_class *class = pool->size_class[i];
- if (!class)
- continue;
-
if (class->index != i)
continue;
diff --git a/mm/zswap.c b/mm/zswap.c
index 104835b379ec..2d48fd59cc7a 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1026,7 +1026,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
SetPageReclaim(page);
/* start writeback */
- __swap_writepage(page, &wbc, end_swap_bio_write);
+ __swap_writepage(page, &wbc);
put_page(page);
zswap_written_back_pages++;
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h
index 6c1aa92a92e4..6ce1f1ceb432 100644
--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -77,6 +77,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 31e5eea2a9b9..7b9dc2426f18 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -30,7 +30,6 @@ map_fixed_noreplace
write_to_hugetlbfs
hmm-tests
memfd_secret
-local_config.*
soft-dirty
split_huge_page_test
ksm_tests
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index d9fa6a9ea584..4ae879f70f4c 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,9 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
# Makefile for vm selftests
-LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h
-
-include local_config.mk
+LOCAL_HDRS += $(top_srcdir)/mm/gup_test.h
uname_M := $(shell uname -m 2>/dev/null || echo not)
MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
@@ -152,23 +150,6 @@ endif
$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
-# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
-$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS)
-
$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
$(OUTPUT)/migration: LDLIBS += -lnuma
-
-local_config.mk local_config.h: check_config.sh
- /bin/sh ./check_config.sh $(CC)
-
-EXTRA_CLEAN += local_config.mk local_config.h
-
-ifeq ($(HMM_EXTRA_LIBS),)
-all: warn_missing_hugelibs
-
-warn_missing_hugelibs:
- @echo ; \
- echo "Warning: missing libhugetlbfs support. Some HMM tests will be skipped." ; \
- echo
-endif
diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh
deleted file mode 100644
index 079c8a40b85d..000000000000
--- a/tools/testing/selftests/vm/check_config.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# Probe for libraries and create header files to record the results. Both C
-# header files and Makefile include fragments are created.
-
-OUTPUT_H_FILE=local_config.h
-OUTPUT_MKFILE=local_config.mk
-
-# libhugetlbfs
-tmpname=$(mktemp)
-tmpfile_c=${tmpname}.c
-tmpfile_o=${tmpname}.o
-
-echo "#include <sys/types.h>" > $tmpfile_c
-echo "#include <hugetlbfs.h>" >> $tmpfile_c
-echo "int func(void) { return 0; }" >> $tmpfile_c
-
-CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
-$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
-
-if [ -f $tmpfile_o ]; then
- echo "#define LOCAL_CONFIG_HAVE_LIBHUGETLBFS 1" > $OUTPUT_H_FILE
- echo "HMM_EXTRA_LIBS = -lhugetlbfs" > $OUTPUT_MKFILE
-else
- echo "// No libhugetlbfs support found" > $OUTPUT_H_FILE
- echo "# No libhugetlbfs support found, so:" > $OUTPUT_MKFILE
- echo "HMM_EXTRA_LIBS = " >> $OUTPUT_MKFILE
-fi
-
-rm ${tmpname}.*
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 529f53b40296..f2c2c970eeb2 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -26,10 +26,6 @@
#include <sys/mman.h>
#include <sys/ioctl.h>
-#include "./local_config.h"
-#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS
-#include <hugetlbfs.h>
-#endif
/*
* This is a private UAPI to the kernel test module so it isn't exported
@@ -733,7 +729,54 @@ TEST_F(hmm, anon_write_huge)
hmm_buffer_free(buffer);
}
-#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS
+/*
+ * Read numeric data from raw and tagged kernel status files. Used to read
+ * /proc and /sys data (without a tag) and /proc/meminfo (with a tag).
+ */
+static long file_read_ulong(char *file, const char *tag)
+{
+ int fd;
+ char buf[2048];
+ int len;
+ char *p, *q;
+ long val;
+
+ fd = open(file, O_RDONLY);
+ if (fd < 0) {
+ /* Error opening the file */
+ return -1;
+ }
+
+ len = read(fd, buf, sizeof(buf));
+ close(fd);
+ if (len < 0) {
+ /* Error in reading the file */
+ return -1;
+ }
+ if (len == sizeof(buf)) {
+ /* Error file is too large */
+ return -1;
+ }
+ buf[len] = '\0';
+
+ /* Search for a tag if provided */
+ if (tag) {
+ p = strstr(buf, tag);
+ if (!p)
+ return -1; /* looks like the line we want isn't there */
+ p += strlen(tag);
+ } else
+ p = buf;
+
+ val = strtol(p, &q, 0);
+ if (*q != ' ') {
+ /* Error parsing the file */
+ return -1;
+ }
+
+ return val;
+}
+
/*
* Write huge TLBFS page.
*/
@@ -742,29 +785,27 @@ TEST_F(hmm, anon_write_hugetlbfs)
struct hmm_buffer *buffer;
unsigned long npages;
unsigned long size;
+ unsigned long default_hsize;
unsigned long i;
int *ptr;
int ret;
- long pagesizes[4];
- int n, idx;
-
- /* Skip test if we can't allocate a hugetlbfs page. */
- n = gethugepagesizes(pagesizes, 4);
- if (n <= 0)
+ default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
+ if (default_hsize < 0 || default_hsize*1024 < default_hsize)
SKIP(return, "Huge page size could not be determined");
- for (idx = 0; --n > 0; ) {
- if (pagesizes[n] < pagesizes[idx])
- idx = n;
- }
- size = ALIGN(TWOMEG, pagesizes[idx]);
+ default_hsize = default_hsize*1024; /* KB to B */
+
+ size = ALIGN(TWOMEG, default_hsize);
npages = size >> self->page_shift;
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
- buffer->ptr = get_hugepage_region(size, GHR_STRICT);
- if (buffer->ptr == NULL) {
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+ if (buffer->ptr == MAP_FAILED) {
free(buffer);
SKIP(return, "Huge page could not be allocated");
}
@@ -788,11 +829,10 @@ TEST_F(hmm, anon_write_hugetlbfs)
for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
ASSERT_EQ(ptr[i], i);
- free_hugepage_region(buffer->ptr);
+ munmap(buffer->ptr, buffer->size);
buffer->ptr = NULL;
hmm_buffer_free(buffer);
}
-#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */
/*
* Read mmap'ed file memory.
@@ -1467,7 +1507,6 @@ TEST_F(hmm2, snapshot)
hmm_buffer_free(buffer);
}
-#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS
/*
* Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
* should be mapped by a large page table entry.
@@ -1477,30 +1516,30 @@ TEST_F(hmm, compound)
struct hmm_buffer *buffer;
unsigned long npages;
unsigned long size;
+ unsigned long default_hsize;
int *ptr;
unsigned char *m;
int ret;
- long pagesizes[4];
- int n, idx;
unsigned long i;
/* Skip test if we can't allocate a hugetlbfs page. */
- n = gethugepagesizes(pagesizes, 4);
- if (n <= 0)
- return;
- for (idx = 0; --n > 0; ) {
- if (pagesizes[n] < pagesizes[idx])
- idx = n;
- }
- size = ALIGN(TWOMEG, pagesizes[idx]);
+ default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
+ if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+ SKIP(return, "Huge page size could not be determined");
+ default_hsize = default_hsize*1024; /* KB to B */
+
+ size = ALIGN(TWOMEG, default_hsize);
npages = size >> self->page_shift;
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
- buffer->ptr = get_hugepage_region(size, GHR_STRICT);
- if (buffer->ptr == NULL) {
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+ if (buffer->ptr == MAP_FAILED) {
free(buffer);
return;
}
@@ -1539,11 +1578,10 @@ TEST_F(hmm, compound)
ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
HMM_DMIRROR_PROT_PMD);
- free_hugepage_region(buffer->ptr);
+ munmap(buffer->ptr, buffer->size);
buffer->ptr = NULL;
hmm_buffer_free(buffer);
}
-#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */
/*
* Test two devices reading the same memory (double mapped).
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
index 155120b67a16..b77b1e28cdb3 100644
--- a/tools/testing/selftests/vm/khugepaged.c
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -14,6 +14,9 @@
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21
#endif
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
#define BASE_ADDR ((void *)(1UL << 30))
static unsigned long hpage_pmd_size;
@@ -23,6 +26,11 @@ static int hpage_pmd_nr;
#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
#define PID_SMAPS "/proc/self/smaps"
+struct collapse_context {
+ void (*collapse)(const char *msg, char *p, int nr_hpages, bool expect);
+ bool enforce_pte_scan_limits;
+};
+
enum thp_enabled {
THP_ALWAYS,
THP_MADVISE,
@@ -90,18 +98,6 @@ struct settings {
struct khugepaged_settings khugepaged;
};
-static struct settings default_settings = {
- .thp_enabled = THP_MADVISE,
- .thp_defrag = THP_DEFRAG_ALWAYS,
- .shmem_enabled = SHMEM_NEVER,
- .use_zero_page = 0,
- .khugepaged = {
- .defrag = 1,
- .alloc_sleep_millisecs = 10,
- .scan_sleep_millisecs = 10,
- },
-};
-
static struct settings saved_settings;
static bool skip_settings_restore;
@@ -279,6 +275,39 @@ static void write_settings(struct settings *settings)
write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
}
+#define MAX_SETTINGS_DEPTH 4
+static struct settings settings_stack[MAX_SETTINGS_DEPTH];
+static int settings_index;
+
+static struct settings *current_settings(void)
+{
+ if (!settings_index) {
+ printf("Fail: No settings set");
+ exit(EXIT_FAILURE);
+ }
+ return settings_stack + settings_index - 1;
+}
+
+static void push_settings(struct settings *settings)
+{
+ if (settings_index >= MAX_SETTINGS_DEPTH) {
+ printf("Fail: Settings stack exceeded");
+ exit(EXIT_FAILURE);
+ }
+ settings_stack[settings_index++] = *settings;
+ write_settings(current_settings());
+}
+
+static void pop_settings(void)
+{
+ if (settings_index <= 0) {
+ printf("Fail: Settings stack empty");
+ exit(EXIT_FAILURE);
+ }
+ --settings_index;
+ write_settings(current_settings());
+}
+
static void restore_settings(int sig)
{
if (skip_settings_restore)
@@ -322,14 +351,6 @@ static void save_settings(void)
signal(SIGQUIT, restore_settings);
}
-static void adjust_settings(void)
-{
-
- printf("Adjust settings...");
- write_settings(&default_settings);
- success("OK");
-}
-
#define MAX_LINE_LENGTH 500
static bool check_for_pattern(FILE *fp, char *pattern, char *buf)
@@ -341,7 +362,7 @@ static bool check_for_pattern(FILE *fp, char *pattern, char *buf)
return false;
}
-static bool check_huge(void *addr)
+static bool check_huge(void *addr, int nr_hpages)
{
bool thp = false;
int ret;
@@ -366,7 +387,7 @@ static bool check_huge(void *addr)
goto err_out;
ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB",
- hpage_pmd_size >> 10);
+ nr_hpages * (hpage_pmd_size >> 10));
if (ret >= MAX_LINE_LENGTH) {
printf("%s: Pattern is too long\n", __func__);
exit(EXIT_FAILURE);
@@ -434,12 +455,12 @@ err_out:
return swap;
}
-static void *alloc_mapping(void)
+static void *alloc_mapping(int nr)
{
void *p;
- p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
if (p != BASE_ADDR) {
printf("Failed to allocate VMA at %p\n", BASE_ADDR);
exit(EXIT_FAILURE);
@@ -456,6 +477,25 @@ static void fill_memory(int *p, unsigned long start, unsigned long end)
p[i * page_size / sizeof(*p)] = i + 0xdead0000;
}
+/*
+ * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
+ * validate_memory()'able contents.
+ */
+static void *alloc_hpage(void)
+{
+ void *p;
+
+ p = alloc_mapping(1);
+ printf("Allocate huge page...");
+ madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+ fill_memory(p, 0, hpage_pmd_size);
+ if (check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+ return p;
+}
+
static void validate_memory(int *p, unsigned long start, unsigned long end)
{
int i;
@@ -469,26 +509,59 @@ static void validate_memory(int *p, unsigned long start, unsigned long end)
}
}
+static void madvise_collapse(const char *msg, char *p, int nr_hpages,
+ bool expect)
+{
+ int ret;
+ struct settings settings = *current_settings();
+
+ printf("%s...", msg);
+ /* Sanity check */
+ if (!check_huge(p, 0)) {
+ printf("Unexpected huge page\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Prevent khugepaged interference and test that MADV_COLLAPSE
+ * ignores /sys/kernel/mm/transparent_hugepage/enabled
+ */
+ settings.thp_enabled = THP_NEVER;
+ push_settings(&settings);
+
+ /* Clear VM_NOHUGEPAGE */
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+ ret = madvise(p, nr_hpages * hpage_pmd_size, MADV_COLLAPSE);
+ if (((bool)ret) == expect)
+ fail("Fail: Bad return value");
+ else if (check_huge(p, nr_hpages) != expect)
+ fail("Fail: check_huge()");
+ else
+ success("OK");
+
+ pop_settings();
+}
+
#define TICK 500000
-static bool wait_for_scan(const char *msg, char *p)
+static bool wait_for_scan(const char *msg, char *p, int nr_hpages)
{
int full_scans;
int timeout = 6; /* 3 seconds */
/* Sanity check */
- if (check_huge(p)) {
+ if (!check_huge(p, 0)) {
printf("Unexpected huge page\n");
exit(EXIT_FAILURE);
}
- madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
/* Wait until the second full_scan completed */
full_scans = read_num("khugepaged/full_scans") + 2;
printf("%s...", msg);
while (timeout--) {
- if (check_huge(p))
+ if (check_huge(p, nr_hpages))
break;
if (read_num("khugepaged/full_scans") >= full_scans)
break;
@@ -496,121 +569,121 @@ static bool wait_for_scan(const char *msg, char *p)
usleep(TICK);
}
- madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
return timeout == -1;
}
+static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
+ bool expect)
+{
+ if (wait_for_scan(msg, p, nr_hpages)) {
+ if (expect)
+ fail("Timeout");
+ else
+ success("OK");
+ return;
+ } else if (check_huge(p, nr_hpages) == expect) {
+ success("OK");
+ } else {
+ fail("Fail");
+ }
+}
+
static void alloc_at_fault(void)
{
- struct settings settings = default_settings;
+ struct settings settings = *current_settings();
char *p;
settings.thp_enabled = THP_ALWAYS;
- write_settings(&settings);
+ push_settings(&settings);
- p = alloc_mapping();
+ p = alloc_mapping(1);
*p = 1;
printf("Allocate huge page on fault...");
- if (check_huge(p))
+ if (check_huge(p, 1))
success("OK");
else
fail("Fail");
- write_settings(&default_settings);
+ pop_settings();
madvise(p, page_size, MADV_DONTNEED);
printf("Split huge PMD on MADV_DONTNEED...");
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
munmap(p, hpage_pmd_size);
}
-static void collapse_full(void)
+static void collapse_full(struct collapse_context *c)
{
void *p;
-
- p = alloc_mapping();
- fill_memory(p, 0, hpage_pmd_size);
- if (wait_for_scan("Collapse fully populated PTE table", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
- validate_memory(p, 0, hpage_pmd_size);
- munmap(p, hpage_pmd_size);
+ int nr_hpages = 4;
+ unsigned long size = nr_hpages * hpage_pmd_size;
+
+ p = alloc_mapping(nr_hpages);
+ fill_memory(p, 0, size);
+ c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
+ true);
+ validate_memory(p, 0, size);
+ munmap(p, size);
}
-static void collapse_empty(void)
+static void collapse_empty(struct collapse_context *c)
{
void *p;
- p = alloc_mapping();
- if (wait_for_scan("Do not collapse empty PTE table", p))
- fail("Timeout");
- else if (check_huge(p))
- fail("Fail");
- else
- success("OK");
+ p = alloc_mapping(1);
+ c->collapse("Do not collapse empty PTE table", p, 1, false);
munmap(p, hpage_pmd_size);
}
-static void collapse_single_pte_entry(void)
+static void collapse_single_pte_entry(struct collapse_context *c)
{
void *p;
- p = alloc_mapping();
+ p = alloc_mapping(1);
fill_memory(p, 0, page_size);
- if (wait_for_scan("Collapse PTE table with single PTE entry present", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ c->collapse("Collapse PTE table with single PTE entry present", p,
+ 1, true);
validate_memory(p, 0, page_size);
munmap(p, hpage_pmd_size);
}
-static void collapse_max_ptes_none(void)
+static void collapse_max_ptes_none(struct collapse_context *c)
{
int max_ptes_none = hpage_pmd_nr / 2;
- struct settings settings = default_settings;
+ struct settings settings = *current_settings();
void *p;
settings.khugepaged.max_ptes_none = max_ptes_none;
- write_settings(&settings);
+ push_settings(&settings);
- p = alloc_mapping();
+ p = alloc_mapping(1);
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
- if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p))
- fail("Timeout");
- else if (check_huge(p))
- fail("Fail");
- else
- success("OK");
+ c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
+ !c->enforce_pte_scan_limits);
validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
- fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
- if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
- validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+ if (c->enforce_pte_scan_limits) {
+ fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+ c->collapse("Collapse with max_ptes_none PTEs empty", p, 1,
+ true);
+ validate_memory(p, 0,
+ (hpage_pmd_nr - max_ptes_none) * page_size);
+ }
munmap(p, hpage_pmd_size);
- write_settings(&default_settings);
+ pop_settings();
}
-static void collapse_swapin_single_pte(void)
+static void collapse_swapin_single_pte(struct collapse_context *c)
{
void *p;
- p = alloc_mapping();
+ p = alloc_mapping(1);
fill_memory(p, 0, hpage_pmd_size);
printf("Swapout one page...");
@@ -625,23 +698,18 @@ static void collapse_swapin_single_pte(void)
goto out;
}
- if (wait_for_scan("Collapse with swapping in single PTE entry", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ c->collapse("Collapse with swapping in single PTE entry", p, 1, true);
validate_memory(p, 0, hpage_pmd_size);
out:
munmap(p, hpage_pmd_size);
}
-static void collapse_max_ptes_swap(void)
+static void collapse_max_ptes_swap(struct collapse_context *c)
{
int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
void *p;
- p = alloc_mapping();
+ p = alloc_mapping(1);
fill_memory(p, 0, hpage_pmd_size);
printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
@@ -656,115 +724,83 @@ static void collapse_max_ptes_swap(void)
goto out;
}
- if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p))
- fail("Timeout");
- else if (check_huge(p))
- fail("Fail");
- else
- success("OK");
+ c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1,
+ !c->enforce_pte_scan_limits);
validate_memory(p, 0, hpage_pmd_size);
- fill_memory(p, 0, hpage_pmd_size);
- printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr);
- if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
- perror("madvise(MADV_PAGEOUT)");
- exit(EXIT_FAILURE);
- }
- if (check_swap(p, max_ptes_swap * page_size)) {
- success("OK");
- } else {
- fail("Fail");
- goto out;
- }
+ if (c->enforce_pte_scan_limits) {
+ fill_memory(p, 0, hpage_pmd_size);
+ printf("Swapout %d of %d pages...", max_ptes_swap,
+ hpage_pmd_nr);
+ if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, max_ptes_swap * page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
- if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
- validate_memory(p, 0, hpage_pmd_size);
+ c->collapse("Collapse with max_ptes_swap pages swapped out", p,
+ 1, true);
+ validate_memory(p, 0, hpage_pmd_size);
+ }
out:
munmap(p, hpage_pmd_size);
}
-static void collapse_single_pte_entry_compound(void)
+static void collapse_single_pte_entry_compound(struct collapse_context *c)
{
void *p;
- p = alloc_mapping();
-
- printf("Allocate huge page...");
- madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
- fill_memory(p, 0, hpage_pmd_size);
- if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ p = alloc_hpage();
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-
printf("Split huge page leaving single PTE mapping compound page...");
madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
- if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ c->collapse("Collapse PTE table with single PTE mapping compound page",
+ p, 1, true);
validate_memory(p, 0, page_size);
munmap(p, hpage_pmd_size);
}
-static void collapse_full_of_compound(void)
+static void collapse_full_of_compound(struct collapse_context *c)
{
void *p;
- p = alloc_mapping();
-
- printf("Allocate huge page...");
- madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
- fill_memory(p, 0, hpage_pmd_size);
- if (check_huge(p))
- success("OK");
- else
- fail("Fail");
-
+ p = alloc_hpage();
printf("Split huge page leaving single PTE page table full of compound pages...");
madvise(p, page_size, MADV_NOHUGEPAGE);
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
- if (wait_for_scan("Collapse PTE table full of compound pages", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ c->collapse("Collapse PTE table full of compound pages", p, 1, true);
validate_memory(p, 0, hpage_pmd_size);
munmap(p, hpage_pmd_size);
}
-static void collapse_compound_extreme(void)
+static void collapse_compound_extreme(struct collapse_context *c)
{
void *p;
int i;
- p = alloc_mapping();
+ p = alloc_mapping(1);
for (i = 0; i < hpage_pmd_nr; i++) {
printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
i + 1, hpage_pmd_nr);
madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
fill_memory(BASE_ADDR, 0, hpage_pmd_size);
- if (!check_huge(BASE_ADDR)) {
+ if (!check_huge(BASE_ADDR, 1)) {
printf("Failed to allocate huge page\n");
exit(EXIT_FAILURE);
}
@@ -793,32 +829,28 @@ static void collapse_compound_extreme(void)
munmap(BASE_ADDR, hpage_pmd_size);
fill_memory(p, 0, hpage_pmd_size);
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
- if (wait_for_scan("Collapse PTE table full of different compound pages", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ c->collapse("Collapse PTE table full of different compound pages", p, 1,
+ true);
validate_memory(p, 0, hpage_pmd_size);
munmap(p, hpage_pmd_size);
}
-static void collapse_fork(void)
+static void collapse_fork(struct collapse_context *c)
{
int wstatus;
void *p;
- p = alloc_mapping();
+ p = alloc_mapping(1);
printf("Allocate small page...");
fill_memory(p, 0, page_size);
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
@@ -829,19 +861,14 @@ static void collapse_fork(void)
skip_settings_restore = true;
exit_status = 0;
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
fill_memory(p, page_size, 2 * page_size);
-
- if (wait_for_scan("Collapse PTE table with single page shared with parent process", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ c->collapse("Collapse PTE table with single page shared with parent process",
+ p, 1, true);
validate_memory(p, 0, page_size);
munmap(p, hpage_pmd_size);
@@ -852,7 +879,7 @@ static void collapse_fork(void)
exit_status += WEXITSTATUS(wstatus);
printf("Check if parent still has small page...");
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
@@ -860,28 +887,19 @@ static void collapse_fork(void)
munmap(p, hpage_pmd_size);
}
-static void collapse_fork_compound(void)
+static void collapse_fork_compound(struct collapse_context *c)
{
int wstatus;
void *p;
- p = alloc_mapping();
-
- printf("Allocate huge page...");
- madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
- fill_memory(p, 0, hpage_pmd_size);
- if (check_huge(p))
- success("OK");
- else
- fail("Fail");
-
+ p = alloc_hpage();
printf("Share huge page over fork()...");
if (!fork()) {
/* Do not touch settings on child exit */
skip_settings_restore = true;
exit_status = 0;
- if (check_huge(p))
+ if (check_huge(p, 1))
success("OK");
else
fail("Fail");
@@ -889,21 +907,17 @@ static void collapse_fork_compound(void)
printf("Split huge page PMD in child process...");
madvise(p, page_size, MADV_NOHUGEPAGE);
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
fill_memory(p, 0, page_size);
write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
- if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ c->collapse("Collapse PTE table full of compound pages in child",
+ p, 1, true);
write_num("khugepaged/max_ptes_shared",
- default_settings.khugepaged.max_ptes_shared);
+ current_settings()->khugepaged.max_ptes_shared);
validate_memory(p, 0, hpage_pmd_size);
munmap(p, hpage_pmd_size);
@@ -914,7 +928,7 @@ static void collapse_fork_compound(void)
exit_status += WEXITSTATUS(wstatus);
printf("Check if parent still has huge page...");
- if (check_huge(p))
+ if (check_huge(p, 1))
success("OK");
else
fail("Fail");
@@ -922,29 +936,20 @@ static void collapse_fork_compound(void)
munmap(p, hpage_pmd_size);
}
-static void collapse_max_ptes_shared()
+static void collapse_max_ptes_shared(struct collapse_context *c)
{
int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
int wstatus;
void *p;
- p = alloc_mapping();
-
- printf("Allocate huge page...");
- madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
- fill_memory(p, 0, hpage_pmd_size);
- if (check_huge(p))
- success("OK");
- else
- fail("Fail");
-
+ p = alloc_hpage();
printf("Share huge page over fork()...");
if (!fork()) {
/* Do not touch settings on child exit */
skip_settings_restore = true;
exit_status = 0;
- if (check_huge(p))
+ if (check_huge(p, 1))
success("OK");
else
fail("Fail");
@@ -952,33 +957,27 @@ static void collapse_max_ptes_shared()
printf("Trigger CoW on page %d of %d...",
hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
- if (!check_huge(p))
+ if (check_huge(p, 0))
success("OK");
else
fail("Fail");
- if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p))
- fail("Timeout");
- else if (!check_huge(p))
- success("OK");
- else
- fail("Fail");
-
- printf("Trigger CoW on page %d of %d...",
- hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
- fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
- if (!check_huge(p))
- success("OK");
- else
- fail("Fail");
-
-
- if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
- fail("Timeout");
- else if (check_huge(p))
- success("OK");
- else
- fail("Fail");
+ c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
+ 1, !c->enforce_pte_scan_limits);
+
+ if (c->enforce_pte_scan_limits) {
+ printf("Trigger CoW on page %d of %d...",
+ hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+ fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) *
+ page_size);
+ if (check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse with max_ptes_shared PTEs shared",
+ p, 1, true);
+ }
validate_memory(p, 0, hpage_pmd_size);
munmap(p, hpage_pmd_size);
@@ -989,7 +988,7 @@ static void collapse_max_ptes_shared()
exit_status += WEXITSTATUS(wstatus);
printf("Check if parent still has huge page...");
- if (check_huge(p))
+ if (check_huge(p, 1))
success("OK");
else
fail("Fail");
@@ -997,8 +996,52 @@ static void collapse_max_ptes_shared()
munmap(p, hpage_pmd_size);
}
-int main(void)
+static void madvise_collapse_existing_thps(void)
{
+ void *p;
+ int err;
+
+ p = alloc_mapping(1);
+ fill_memory(p, 0, hpage_pmd_size);
+
+ printf("Collapse fully populated PTE table...");
+ /*
+ * Note that we don't set MADV_HUGEPAGE here, which
+ * also tests that VM_HUGEPAGE isn't required for
+ * MADV_COLLAPSE in "madvise" mode.
+ */
+ err = madvise(p, hpage_pmd_size, MADV_COLLAPSE);
+ if (err == 0 && check_huge(p, 1)) {
+ success("OK");
+ printf("Re-collapse PMD-mapped hugepage");
+ err = madvise(p, hpage_pmd_size, MADV_COLLAPSE);
+ if (err == 0 && check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+ } else {
+ fail("Fail");
+ }
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+}
+
+int main(int argc, const char **argv)
+{
+ struct collapse_context c;
+ struct settings default_settings = {
+ .thp_enabled = THP_MADVISE,
+ .thp_defrag = THP_DEFRAG_ALWAYS,
+ .shmem_enabled = SHMEM_NEVER,
+ .use_zero_page = 0,
+ .khugepaged = {
+ .defrag = 1,
+ .alloc_sleep_millisecs = 10,
+ .scan_sleep_millisecs = 10,
+ },
+ };
+ const char *tests = argc == 1 ? "all" : argv[1];
+
setbuf(stdout, NULL);
page_size = getpagesize();
@@ -1011,21 +1054,47 @@ int main(void)
default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
save_settings();
- adjust_settings();
+ push_settings(&default_settings);
alloc_at_fault();
- collapse_full();
- collapse_empty();
- collapse_single_pte_entry();
- collapse_max_ptes_none();
- collapse_swapin_single_pte();
- collapse_max_ptes_swap();
- collapse_single_pte_entry_compound();
- collapse_full_of_compound();
- collapse_compound_extreme();
- collapse_fork();
- collapse_fork_compound();
- collapse_max_ptes_shared();
+
+ if (!strcmp(tests, "khugepaged") || !strcmp(tests, "all")) {
+ printf("\n*** Testing context: khugepaged ***\n");
+ c.collapse = &khugepaged_collapse;
+ c.enforce_pte_scan_limits = true;
+
+ collapse_full(&c);
+ collapse_empty(&c);
+ collapse_single_pte_entry(&c);
+ collapse_max_ptes_none(&c);
+ collapse_swapin_single_pte(&c);
+ collapse_max_ptes_swap(&c);
+ collapse_single_pte_entry_compound(&c);
+ collapse_full_of_compound(&c);
+ collapse_compound_extreme(&c);
+ collapse_fork(&c);
+ collapse_fork_compound(&c);
+ collapse_max_ptes_shared(&c);
+ }
+ if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) {
+ printf("\n*** Testing context: madvise ***\n");
+ c.collapse = &madvise_collapse;
+ c.enforce_pte_scan_limits = false;
+
+ collapse_full(&c);
+ collapse_empty(&c);
+ collapse_single_pte_entry(&c);
+ collapse_max_ptes_none(&c);
+ collapse_swapin_single_pte(&c);
+ collapse_max_ptes_swap(&c);
+ collapse_single_pte_entry_compound(&c);
+ collapse_full_of_compound(&c);
+ collapse_compound_extreme(&c);
+ collapse_fork(&c);
+ collapse_fork_compound(&c);
+ collapse_max_ptes_shared(&c);
+ madvise_collapse_existing_thps();
+ }
restore_settings(0);
}
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index de86983b8a0f..e780e76c26b8 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -120,11 +120,16 @@ run_test ./gup_test -a
# Dump pages 0, 19, and 4096, using pin_user_pages:
run_test ./gup_test -ct -F 0x1 0 19 0x1000
-run_test ./userfaultfd anon 20 16
-# Test requires source and destination huge pages. Size of source
-# (half_ufd_size_MB) is passed as argument to test.
-run_test ./userfaultfd hugetlb "$half_ufd_size_MB" 32
-run_test ./userfaultfd shmem 20 16
+uffd_mods=("" ":dev")
+for mod in "${uffd_mods[@]}"; do
+ run_test ./userfaultfd anon${mod} 20 16
+ # Hugetlb tests require source and destination huge pages. Pass in half
+ # the size ($half_ufd_size_MB), which is used for *each*.
+ run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
+ run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 "$mnt"/uffd-test
+ rm -f "$mnt"/uffd-test
+ run_test ./userfaultfd shmem${mod} 20 16
+done
#cleanup
umount "$mnt"
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 7c3f1b0ab468..7be709d9eed0 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -77,6 +77,11 @@ static int bounces;
#define TEST_SHMEM 3
static int test_type;
+#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
+
+/* test using /dev/userfaultfd, instead of userfaultfd(2) */
+static bool test_dev_userfaultfd;
+
/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
static volatile bool test_uffdio_copy_eexist = true;
@@ -125,6 +130,8 @@ struct uffd_stats {
const char *examples =
"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
"./userfaultfd anon 100 99999\n\n"
+ "# Run the same anonymous memory test, but using /dev/userfaultfd:\n"
+ "./userfaultfd anon:dev 100 99999\n\n"
"# Run share memory test on 1GiB region with 99 bounces:\n"
"./userfaultfd shmem 1000 99\n\n"
"# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
@@ -141,6 +148,14 @@ static void usage(void)
"[hugetlbfs_file]\n\n");
fprintf(stderr, "Supported <test type>: anon, hugetlb, "
"hugetlb_shared, shmem\n\n");
+ fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. "
+ "Supported mods:\n");
+ fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n");
+ fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n");
+ fprintf(stderr, "\nExample test mod usage:\n");
+ fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n");
+ fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n");
+
fprintf(stderr, "Examples:\n\n");
fprintf(stderr, "%s", examples);
exit(1);
@@ -154,12 +169,14 @@ static void usage(void)
ret, __LINE__); \
} while (0)
-#define err(fmt, ...) \
+#define errexit(exitcode, fmt, ...) \
do { \
_err(fmt, ##__VA_ARGS__); \
- exit(1); \
+ exit(exitcode); \
} while (0)
+#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
+
static void uffd_stats_reset(struct uffd_stats *uffd_stats,
unsigned long n_cpus)
{
@@ -383,13 +400,34 @@ static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
}
}
+static int __userfaultfd_open_dev(void)
+{
+ int fd, _uffd;
+
+ fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
+
+ _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
+ if (_uffd < 0)
+ errexit(errno == ENOTTY ? KSFT_SKIP : 1,
+ "creating userfaultfd failed");
+ close(fd);
+ return _uffd;
+}
+
static void userfaultfd_open(uint64_t *features)
{
struct uffdio_api uffdio_api;
- uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
- if (uffd < 0)
- err("userfaultfd syscall not available in this kernel");
+ if (test_dev_userfaultfd)
+ uffd = __userfaultfd_open_dev();
+ else {
+ uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
+ if (uffd < 0)
+ errexit(errno == ENOSYS ? KSFT_SKIP : 1,
+ "creating userfaultfd failed");
+ }
uffd_flags = fcntl(uffd, F_GETFD, NULL);
uffdio_api.api = UFFD_API;
@@ -1584,8 +1622,6 @@ unsigned long default_huge_page_size(void)
static void set_test_type(const char *type)
{
- uint64_t features = UFFD_API_FEATURES;
-
if (!strcmp(type, "anon")) {
test_type = TEST_ANON;
uffd_test_ops = &anon_uffd_test_ops;
@@ -1603,9 +1639,29 @@ static void set_test_type(const char *type)
test_type = TEST_SHMEM;
uffd_test_ops = &shmem_uffd_test_ops;
test_uffdio_minor = true;
- } else {
- err("Unknown test type: %s", type);
}
+}
+
+static void parse_test_type_arg(const char *raw_type)
+{
+ char *buf = strdup(raw_type);
+ uint64_t features = UFFD_API_FEATURES;
+
+ while (buf) {
+ const char *token = strsep(&buf, ":");
+
+ if (!test_type)
+ set_test_type(token);
+ else if (!strcmp(token, "dev"))
+ test_dev_userfaultfd = true;
+ else if (!strcmp(token, "syscall"))
+ test_dev_userfaultfd = false;
+ else
+ err("unrecognized test mod '%s'", token);
+ }
+
+ if (!test_type)
+ err("failed to parse test type argument: '%s'", raw_type);
if (test_type == TEST_HUGETLB)
page_size = default_huge_page_size();
@@ -1653,7 +1709,7 @@ int main(int argc, char **argv)
err("failed to arm SIGALRM");
alarm(ALARM_INTERVAL_SECS);
- set_test_type(argv[1]);
+ parse_test_type_arg(argv[1]);
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index ec2e67c85b84..ce860ab94162 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -470,7 +470,12 @@ static bool match_str_list(const char *str, char **list, int list_size)
static bool is_need(char *buf)
{
- if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0)
+ __u64 ts_nsec, free_ts_nsec;
+
+ ts_nsec = get_ts_nsec(buf);
+ free_ts_nsec = get_free_ts_nsec(buf);
+
+ if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec)
return false;
if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size))
return false;