-rw-r--r--Documentation/ABI/testing/sysfs-bus-dax153
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-cma6
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-mempolicy4
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave25
-rw-r--r--Documentation/admin-guide/mm/damon/usage.rst127
-rw-r--r--Documentation/admin-guide/mm/numa_memory_policy.rst9
-rw-r--r--Documentation/dev-tools/kasan.rst20
-rw-r--r--Documentation/mm/damon/design.rst50
-rw-r--r--Documentation/mm/damon/maintainer-profile.rst8
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst20
-rw-r--r--Documentation/translations/zh_CN/dev-tools/kasan.rst20
-rw-r--r--Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst20
-rw-r--r--Documentation/translations/zh_TW/dev-tools/kasan.rst20
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/arc/Kconfig1
-rw-r--r--arch/arc/include/asm/cachetype.h9
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/Kconfig.debug2
-rw-r--r--arch/arm/configs/aspeed_g4_defconfig2
-rw-r--r--arch/arm/configs/aspeed_g5_defconfig2
-rw-r--r--arch/arm/include/asm/cachetype.h2
-rw-r--r--arch/arm/include/asm/pgtable.h2
-rw-r--r--arch/arm/include/asm/ptdump.h6
-rw-r--r--arch/arm/mm/init.c2
-rw-r--r--arch/arm/mm/mmu.c2
-rw-r--r--arch/arm64/Kconfig9
-rw-r--r--arch/arm64/include/asm/pgtable.h431
-rw-r--r--arch/arm64/include/asm/ptdump.h7
-rw-r--r--arch/arm64/include/asm/tlbflush.h13
-rw-r--r--arch/arm64/kernel/efi.c4
-rw-r--r--arch/arm64/kernel/mte.c2
-rw-r--r--arch/arm64/kvm/guest.c2
-rw-r--r--arch/arm64/mm/Makefile1
-rw-r--r--arch/arm64/mm/contpte.c404
-rw-r--r--arch/arm64/mm/fault.c12
-rw-r--r--arch/arm64/mm/fixmap.c4
-rw-r--r--arch/arm64/mm/hugetlbpage.c47
-rw-r--r--arch/arm64/mm/kasan_init.c6
-rw-r--r--arch/arm64/mm/mmu.c18
-rw-r--r--arch/arm64/mm/pageattr.c6
-rw-r--r--arch/arm64/mm/ptdump.c11
-rw-r--r--arch/arm64/mm/trans_pgd.c6
-rw-r--r--arch/csky/Kconfig1
-rw-r--r--arch/csky/include/asm/cachetype.h9
-rw-r--r--arch/m68k/Kconfig1
-rw-r--r--arch/m68k/include/asm/cachetype.h9
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/include/asm/cachetype.h9
-rw-r--r--arch/nios2/Kconfig1
-rw-r--r--arch/nios2/include/asm/cachetype.h10
-rw-r--r--arch/nios2/include/asm/pgtable.h2
-rw-r--r--arch/parisc/Kconfig1
-rw-r--r--arch/parisc/include/asm/cachetype.h9
-rw-r--r--arch/powerpc/include/asm/pgtable.h2
-rw-r--r--arch/powerpc/include/asm/tlb.h2
-rw-r--r--arch/powerpc/mm/hugetlbpage.c4
-rw-r--r--arch/powerpc/mm/mmu_decl.h6
-rw-r--r--arch/powerpc/mm/pgtable.c5
-rw-r--r--arch/powerpc/mm/pgtable_32.c4
-rw-r--r--arch/powerpc/mm/pgtable_64.c3
-rw-r--r--arch/powerpc/mm/ptdump/ptdump.c21
-rw-r--r--arch/riscv/include/asm/pgtable.h2
-rw-r--r--arch/riscv/include/asm/ptdump.h22
-rw-r--r--arch/riscv/mm/init.c3
-rw-r--r--arch/riscv/mm/ptdump.c12
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/include/asm/pgtable.h2
-rw-r--r--arch/s390/include/asm/ptdump.h14
-rw-r--r--arch/s390/include/asm/tlb.h30
-rw-r--r--arch/s390/mm/dump_pagetables.c21
-rw-r--r--arch/s390/mm/init.c5
-rw-r--r--arch/s390/mm/pgtable.c4
-rw-r--r--arch/s390/mm/vmem.c62
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/include/asm/cachetype.h9
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/include/asm/cachetype.h14
-rw-r--r--arch/sparc/include/asm/pgtable_64.h2
-rw-r--r--arch/x86/include/asm/mmu.h2
-rw-r--r--arch/x86/include/asm/pgtable.h13
-rw-r--r--arch/x86/kernel/alternative.c2
-rw-r--r--arch/x86/mm/dump_pagetables.c24
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/tlb.c37
-rw-r--r--arch/x86/xen/mmu_pv.c2
-rw-r--r--arch/xtensa/Kconfig1
-rw-r--r--arch/xtensa/include/asm/cachetype.h10
-rw-r--r--drivers/base/cacheinfo.c50
-rw-r--r--drivers/base/memory.c23
-rw-r--r--drivers/block/zram/zcomp.c5
-rw-r--r--drivers/block/zram/zram_drv.c2
-rw-r--r--drivers/cpuidle/cpuidle.c2
-rw-r--r--drivers/dax/bus.c293
-rw-r--r--drivers/dax/super.c14
-rw-r--r--drivers/md/dm.c17
-rw-r--r--drivers/nvdimm/pmem.c23
-rw-r--r--drivers/s390/block/dcssblk.c11
-rw-r--r--drivers/s390/char/sclp_cmd.c44
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/fuse/virtio_fs.c15
-rw-r--r--fs/proc/task_mmu.c17
-rw-r--r--fs/userfaultfd.c86
-rw-r--r--include/asm-generic/tlb.h44
-rw-r--r--include/linux/cacheinfo.h6
-rw-r--r--include/linux/cma.h6
-rw-r--r--include/linux/dax.h17
-rw-r--r--include/linux/efi.h5
-rw-r--r--include/linux/gfp.h2
-rw-r--r--include/linux/highmem.h14
-rw-r--r--include/linux/list_lru.h20
-rw-r--r--include/linux/memcontrol.h1
-rw-r--r--include/linux/memory.h9
-rw-r--r--include/linux/memory_hotplug.h24
-rw-r--r--include/linux/memremap.h3
-rw-r--r--include/linux/mm.h17
-rw-r--r--include/linux/mm_types.h37
-rw-r--r--include/linux/mmu_context.h2
-rw-r--r--include/linux/mmzone.h6
-rw-r--r--include/linux/page_counter.h2
-rw-r--r--include/linux/pgtable.h135
-rw-r--r--include/linux/ptdump.h10
-rw-r--r--include/linux/sched.h10
-rw-r--r--include/linux/sched/mm.h55
-rw-r--r--include/linux/swapops.h13
-rw-r--r--include/linux/userfaultfd_k.h75
-rw-r--r--include/linux/zswap.h11
-rw-r--r--include/trace/events/compaction.h6
-rw-r--r--include/uapi/linux/mempolicy.h1
-rw-r--r--init/main.c2
-rw-r--r--kernel/dma/contiguous.c6
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--lib/maple_tree.c6
-rw-r--r--lib/test_xarray.c230
-rw-r--r--mm/Kconfig37
-rw-r--r--mm/cma.c28
-rw-r--r--mm/cma.h5
-rw-r--r--mm/cma_sysfs.c15
-rw-r--r--mm/compaction.c81
-rw-r--r--mm/damon/Kconfig7
-rw-r--r--mm/damon/dbgfs.c26
-rw-r--r--mm/damon/sysfs-common.h2
-rw-r--r--mm/damon/sysfs-schemes.c22
-rw-r--r--mm/damon/sysfs.c21
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/huge_memory.c191
-rw-r--r--mm/hugetlb.c64
-rw-r--r--mm/internal.h9
-rw-r--r--mm/kasan/common.c2
-rw-r--r--mm/kasan/kasan_test.c82
-rw-r--r--mm/kasan/kasan_test_module.c4
-rw-r--r--mm/kasan/report.c2
-rw-r--r--mm/khugepaged.c4
-rw-r--r--mm/kmsan/hooks.c36
-rw-r--r--mm/list_lru.c20
-rw-r--r--mm/memcontrol.c97
-rw-r--r--mm/memory-tiers.c26
-rw-r--r--mm/memory.c446
-rw-r--r--mm/memory_hotplug.c34
-rw-r--r--mm/mempolicy.c492
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/mmap.c110
-rw-r--r--mm/mmu_gather.c111
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/page_alloc.c39
-rw-r--r--mm/ptdump.c22
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/sparse.c3
-rw-r--r--mm/swap.c12
-rw-r--r--mm/swap_slots.c3
-rw-r--r--mm/swap_state.c15
-rw-r--r--mm/swapfile.c29
-rw-r--r--mm/userfaultfd.c484
-rw-r--r--mm/vmscan.c165
-rw-r--r--mm/zswap.c1697
-rw-r--r--scripts/gdb/linux/vmalloc.py56
-rw-r--r--tools/mm/Makefile9
-rw-r--r--tools/mm/thpmaps675
-rw-r--r--tools/testing/selftests/cgroup/test_zswap.c122
-rw-r--r--tools/testing/selftests/damon/.gitignore2
-rw-r--r--tools/testing/selftests/damon/Makefile5
-rw-r--r--tools/testing/selftests/damon/_chk_dependency.sh20
-rw-r--r--tools/testing/selftests/damon/_damon_sysfs.py77
-rw-r--r--tools/testing/selftests/damon/_debugfs_common.sh7
-rw-r--r--tools/testing/selftests/damon/damos_apply_interval.py67
-rw-r--r--tools/testing/selftests/damon/damos_quota.py67
-rwxr-xr-xtools/testing/selftests/damon/debugfs_empty_targets.sh12
-rw-r--r--tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c68
-rw-r--r--tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh22
-rw-r--r--tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c80
-rw-r--r--tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh14
-rw-r--r--tools/testing/selftests/memfd/memfd_test.c10
-rw-r--r--tools/testing/selftests/mm/.gitignore1
-rw-r--r--tools/testing/selftests/mm/Makefile1
-rw-r--r--tools/testing/selftests/mm/compaction_test.c37
-rw-r--r--tools/testing/selftests/mm/hugetlb_madv_vs_map.c124
-rw-r--r--tools/testing/selftests/mm/ksm_functional_tests.c4
-rw-r--r--tools/testing/selftests/mm/map_fixed_noreplace.c96
-rw-r--r--tools/testing/selftests/mm/map_hugetlb.c42
-rw-r--r--tools/testing/selftests/mm/map_populate.c37
-rw-r--r--tools/testing/selftests/mm/mlock-random-test.c136
-rw-r--r--tools/testing/selftests/mm/mlock2-tests.c282
-rw-r--r--tools/testing/selftests/mm/mlock2.h11
-rw-r--r--tools/testing/selftests/mm/mrelease_test.c80
-rw-r--r--tools/testing/selftests/mm/mremap_dontunmap.c32
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh12
-rw-r--r--tools/testing/selftests/mm/split_huge_page_test.c161
-rw-r--r--tools/testing/selftests/mm/thuge-gen.c147
-rw-r--r--tools/testing/selftests/mm/transhuge-stress.c36
-rw-r--r--tools/testing/selftests/mm/virtual_address_range.c44
-rw-r--r--tools/testing/selftests/mm/vm_util.c6
212 files changed, 7244 insertions, 3113 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax
new file mode 100644
index 000000000000..b34266bfae49
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-dax
@@ -0,0 +1,153 @@
+What: /sys/bus/dax/devices/daxX.Y/align
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RW) Provides a way to specify an alignment for a dax device.
+ Values allowed are constrained by the physical address ranges
+ that back the dax device, and also by arch requirements.
+
+What: /sys/bus/dax/devices/daxX.Y/mapping
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (WO) Provides a way to allocate a mapping range under a dax
+ device. Specified in the format <start>-<end>.
+
+What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/start
+What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/end
+What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/page_offset
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) A dax device may have multiple constituent discontiguous
+ address ranges. These are represented by the different
+ 'mappingX' subdirectories. The 'start' attribute indicates the
+ start physical address for the given range. The 'end' attribute
+ indicates the end physical address for the given range. The
+ 'page_offset' attribute indicates the offset of the current
+ range in the dax device.
+
+What: /sys/bus/dax/devices/daxX.Y/resource
+Date: June, 2019
+KernelVersion: v5.3
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The resource attribute indicates the starting physical
+ address of a dax device. In case of a device with multiple
+ constituent ranges, it indicates the starting address of the
+ first range.
+
+What: /sys/bus/dax/devices/daxX.Y/size
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RW) The size attribute indicates the total size of a dax
+ device. For creating subdivided dax devices, or for resizing
+ an existing device, the new size can be written to this as
+ part of the reconfiguration process.
+
+What: /sys/bus/dax/devices/daxX.Y/numa_node
+Date: November, 2019
+KernelVersion: v5.5
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) If NUMA is enabled and the platform has affinitized the
+ backing device for this dax device, emit the CPU node
+ affinity for this device.
+
+What: /sys/bus/dax/devices/daxX.Y/target_node
+Date: February, 2019
+KernelVersion: v5.1
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The target-node attribute is the Linux numa-node that a
+ device-dax instance may create when it is online. Prior to
+ being online the device's 'numa_node' property reflects the
+ closest online cpu node which is the typical expectation of a
+ device 'numa_node'. Once it is online it becomes its own
+ distinct numa node.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/available_size
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The available_size attribute tracks available dax region
+ capacity. This only applies to volatile hmem devices, not pmem
+ devices, since pmem devices are defined by nvdimm namespace
+ boundaries.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/size
+Date: July, 2017
+KernelVersion: v5.1
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The size attribute indicates the size of a given dax region
+ in bytes.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/align
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The align attribute indicates alignment of the dax region.
+ Changes on align may not always be valid, when say certain
+ mappings were created with 2M and then we switch to 1G. This
+ validates all ranges against the new value being attempted, post
+ resizing.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/seed
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The seed device is a concept for dynamic dax regions to be
+ able to split the region amongst multiple sub-instances. The
+ seed device, similar to libnvdimm seed devices, is a device
+ that starts with zero capacity allocated and unbound to a
+ driver.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/create
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RW) The create interface to the dax region provides a way to
+ create a new unconfigured dax device under the given region, which
+ can then be configured (with a size etc.) and then probed.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/delete
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (WO) The delete interface for a dax region provides for deletion
+ of any 0-sized and idle dax devices.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/id
+Date: July, 2017
+KernelVersion: v5.1
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The id attribute indicates the region id of a dax region.
+
+What: /sys/bus/dax/devices/daxX.Y/memmap_on_memory
+Date: January, 2024
+KernelVersion: v6.8
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RW) Control the memmap_on_memory setting if the dax device
+ were to be hotplugged as system memory. This determines whether
+ the 'altmap' for the hotplugged memory will be placed on the
+ device being hotplugged (memmap_on_memory=1) or if it will be
+ placed on regular memory (memmap_on_memory=0). This attribute
+ must be set before the device is handed over to the 'kmem'
+ driver (i.e. hotplugged into system-ram). Additionally, this
+ depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
+ memmap_on_memory parameter for memory_hotplug. This is
+ typically set on the kernel command line -
+ memory_hotplug.memmap_on_memory set to 'true' or 'force'.
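
For illustration, a minimal user-space sketch of driving the attributes documented above follows. The device and region names (``dax0.0``, ``dax0.1``) and the 1 GiB size are assumptions for the example, not values taken from this patch, and error handling is trimmed for brevity::

  #include <stdio.h>

  /* Write a single string value to a sysfs attribute. */
  static int write_attr(const char *path, const char *val)
  {
          FILE *f = fopen(path, "w");

          if (!f)
                  return -1;
          fputs(val, f);
          return fclose(f);
  }

  int main(void)
  {
          char buf[64];
          FILE *f;

          /*
           * Remaining capacity of the parent region (volatile hmem regions
           * only); the path mirrors the documented readlink -f .../.. form.
           */
          f = fopen("/sys/bus/dax/devices/dax0.0/../dax_region/available_size", "r");
          if (f && fgets(buf, sizeof(buf), f))
                  printf("available: %s", buf);
          if (f)
                  fclose(f);

          /* Resize a subdivided device (assumed name) before it is probed. */
          write_attr("/sys/bus/dax/devices/dax0.1/size", "1073741824");

          /*
           * Place the altmap on the device itself when it is later hotplugged
           * as system RAM via the 'kmem' driver (see memmap_on_memory above).
           */
          write_attr("/sys/bus/dax/devices/dax0.1/memmap_on_memory", "1");

          return 0;
  }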
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma
index 02b2bb60c296..dfd755201142 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-cma
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma
@@ -23,3 +23,9 @@ Date: Feb 2021
Contact: Minchan Kim <minchan@kernel.org>
Description:
the number of pages CMA API failed to allocate
+
+What: /sys/kernel/mm/cma/<cma-heap-name>/release_pages_success
+Date: Feb 2024
+Contact: Anshuman Khandual <anshuman.khandual@arm.com>
+Description:
+ the number of pages CMA API succeeded to release
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
new file mode 100644
index 000000000000..8ac327fd7fb6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
@@ -0,0 +1,4 @@
+What: /sys/kernel/mm/mempolicy/
+Date: January 2024
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Interface for Mempolicy
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
new file mode 100644
index 000000000000..0b7972de04e9
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
@@ -0,0 +1,25 @@
+What: /sys/kernel/mm/mempolicy/weighted_interleave/
+Date: January 2024
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Configuration Interface for the Weighted Interleave policy
+
+What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN
+Date: January 2024
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Weight configuration interface for nodeN
+
+ The interleave weight for a memory node (N). These weights are
+ utilized by tasks which have set their mempolicy to
+ MPOL_WEIGHTED_INTERLEAVE.
+
+ These weights only affect new allocations, and changes at runtime
+ will not cause migrations on already allocated pages.
+
+ The minimum weight for a node is always 1.
+
+ Minimum weight: 1
+ Maximum weight: 255
+
+ Writing an empty string or `0` will reset the weight to the
+ system default. The system default may be set by the kernel
+ or drivers at boot or during hotplug events.
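
As an illustration of the interface above, the sketch below writes example weights for two nodes. The node numbers and the 5:2 ratio are assumptions for the example only; writing ``0`` (or an empty string) would restore the system default::

  #include <stdio.h>

  /* Write an interleave weight for one node; "0" resets to the default. */
  static int set_weight(int node, const char *weight)
  {
          char path[96];
          FILE *f;

          snprintf(path, sizeof(path),
                   "/sys/kernel/mm/mempolicy/weighted_interleave/node%d", node);
          f = fopen(path, "w");
          if (!f)
                  return -1;
          fputs(weight, f);
          return fclose(f);
  }

  int main(void)
  {
          set_weight(0, "5");     /* ~5 pages on node 0 ...          */
          set_weight(1, "2");     /* ... for every 2 pages on node 1 */
          return 0;
  }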
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 9d23144bf985..22254997723c 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -180,19 +180,14 @@ In each context directory, two files (``avail_operations`` and ``operations``)
and three directories (``monitoring_attrs``, ``targets``, and ``schemes``)
exist.
-DAMON supports multiple types of monitoring operations, including those for
-virtual address space and the physical address space. You can get the list of
-available monitoring operations set on the currently running kernel by reading
+DAMON supports multiple types of :ref:`monitoring operations
+<damon_design_configurable_operations_set>`, including those for virtual address
+space and the physical address space. You can get the list of available
+monitoring operations set on the currently running kernel by reading
``avail_operations`` file. Based on the kernel configuration, the file will
-list some or all of below keywords.
-
- - vaddr: Monitor virtual address spaces of specific processes
- - fvaddr: Monitor fixed virtual address ranges
- - paddr: Monitor the physical address space of the system
-
-Please refer to :ref:`regions sysfs directory <sysfs_regions>` for detailed
-differences between the operations sets in terms of the monitoring target
-regions.
+list different available operation sets. Please refer to the :ref:`design
+<damon_operations_set>` for the list of all available operation sets and their
+brief explanations.
You can set and get what type of monitoring operations DAMON will use for the
context by writing one of the keywords listed in ``avail_operations`` file and
@@ -247,17 +242,11 @@ process to the ``pid_target`` file.
targets/<N>/regions
-------------------
-When ``vaddr`` monitoring operations set is being used (``vaddr`` is written to
-the ``contexts/<N>/operations`` file), DAMON automatically sets and updates the
-monitoring target regions so that entire memory mappings of target processes
-can be covered. However, users could want to set the initial monitoring region
-to specific address ranges.
-
-In contrast, DAMON do not automatically sets and updates the monitoring target
-regions when ``fvaddr`` or ``paddr`` monitoring operations sets are being used
-(``fvaddr`` or ``paddr`` have written to the ``contexts/<N>/operations``).
-Therefore, users should set the monitoring target regions by themselves in the
-cases.
+In case of ``fvaddr`` or ``paddr`` monitoring operations sets, users are
+required to set the monitoring target address ranges. In case of ``vaddr``
+operations set, it is not mandatory, but users can optionally set the initial
+monitoring region to specific address ranges. Please refer to the :ref:`design
+<damon_design_vaddr_target_regions_construction>` for more details.
For such cases, users can explicitly set the initial monitoring target regions
as they want, by writing proper values to the files under this directory.
@@ -302,27 +291,8 @@ In each scheme directory, five directories (``access_pattern``, ``quotas``,
The ``action`` file is for setting and getting the scheme's :ref:`action
<damon_design_damos_action>`. The keywords that can be written to and read
-from the file and their meaning are as below.
-
-Note that support of each action depends on the running DAMON operations set
-:ref:`implementation <sysfs_context>`.
-
- - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``.
- Supported by ``vaddr`` and ``fvaddr`` operations set.
- - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``.
- Supported by ``vaddr`` and ``fvaddr`` operations set.
- - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
- Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set.
- - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``.
- Supported by ``vaddr`` and ``fvaddr`` operations set.
- - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``.
- Supported by ``vaddr`` and ``fvaddr`` operations set.
- - ``lru_prio``: Prioritize the region on its LRU lists.
- Supported by ``paddr`` operations set.
- - ``lru_deprio``: Deprioritize the region on its LRU lists.
- Supported by ``paddr`` operations set.
- - ``stat``: Do nothing but count the statistics.
- Supported by all operations sets.
+from the file and their meaning are same to those of the list on
+:ref:`design doc <damon_design_damos_action>`.
The ``apply_interval_us`` file is for setting and getting the scheme's
:ref:`apply_interval <damon_design_damos>` in microseconds.
@@ -359,7 +329,8 @@ respectively. Then, DAMON tries to use only up to ``time quota`` milliseconds
for applying the ``action`` to memory regions of the ``access_pattern``, and to
apply the action to only up to ``bytes`` bytes of memory regions within the
``reset_interval_ms``. Setting both ``ms`` and ``bytes`` zero disables the
-quota limits.
+quota limits unless at least one :ref:`goal <sysfs_schemes_quota_goals>` is
+set.
Under ``weights`` directory, three files (``sz_permil``,
``nr_accesses_permil``, and ``age_permil``) exist.
@@ -579,11 +550,11 @@ monitoring results recording.
While the monitoring is turned on, you could record the tracepoint events and
show results using tracepoint supporting tools like ``perf``. For example::
- # echo on > monitor_on
+ # echo on > kdamonds/0/state
# perf record -e damon:damon_aggregated &
# sleep 5
# kill 9 $(pidof perf)
- # echo off > monitor_on
+ # echo off > kdamonds/0/state
# perf script
kdamond.0 46568 [027] 79357.842179: damon:damon_aggregated: target_id=0 nr_regions=11 122509119488-135708762112: 0 864
[...]
@@ -628,9 +599,17 @@ debugfs Interface (DEPRECATED!)
move, please report your usecase to damon@lists.linux.dev and
linux-mm@kvack.org.
-DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
-``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
+DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
+``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``,
+``mk_contexts`` and ``rm_contexts`` under its debugfs directory,
+``<debugfs>/damon/``.
+
+
+``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
+notice. Reading it returns the deprecation notice, as below::
+
+ # cat DEPRECATED
+ DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
Attributes
@@ -755,19 +734,17 @@ Action
~~~~~~
The ``<action>`` is a predefined integer for memory management :ref:`actions
-<damon_design_damos_action>`. The supported numbers and their meanings are as
-below.
-
- - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if
- ``target`` is ``paddr``.
- - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if
- ``target`` is ``paddr``.
- - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
- - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if
- ``target`` is ``paddr``.
- - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if
- ``target`` is ``paddr``.
- - 5: Do nothing but count the statistics
+<damon_design_damos_action>`. The mapping between the ``<action>`` values and
+the memory management actions is as below. For the detailed meaning of the
+action and DAMON operations set supporting each action, please refer to the
+list on :ref:`design doc <damon_design_damos_action>`.
+
+ - 0: ``willneed``
+ - 1: ``cold``
+ - 2: ``pageout``
+ - 3: ``hugepage``
+ - 4: ``nohugepage``
+ - 5: ``stat``
Quota
~~~~~
@@ -848,16 +825,16 @@ Turning On/Off
Setting the files as described above doesn't incur effect unless you explicitly
start the monitoring. You can start, stop, and check the current status of the
-monitoring by writing to and reading from the ``monitor_on`` file. Writing
-``on`` to the file starts the monitoring of the targets with the attributes.
-Writing ``off`` to the file stops those. DAMON also stops if every target
-process is terminated. Below example commands turn on, off, and check the
-status of DAMON::
+monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file.
+Writing ``on`` to the file starts the monitoring of the targets with the
+attributes. Writing ``off`` to the file stops those. DAMON also stops if
+every target process is terminated. Below example commands turn on, off, and
+check the status of DAMON::
# cd <debugfs>/damon
- # echo on > monitor_on
- # echo off > monitor_on
- # cat monitor_on
+ # echo on > monitor_on_DEPRECATED
+ # echo off > monitor_on_DEPRECATED
+ # cat monitor_on_DEPRECATED
off
Please note that you cannot write to the above-mentioned debugfs files while
@@ -873,11 +850,11 @@ can get the pid of the thread by reading the ``kdamond_pid`` file. When the
monitoring is turned off, reading the file returns ``none``. ::
# cd <debugfs>/damon
- # cat monitor_on
+ # cat monitor_on_DEPRECATED
off
# cat kdamond_pid
none
- # echo on > monitor_on
+ # echo on > monitor_on_DEPRECATED
# cat kdamond_pid
18594
@@ -907,5 +884,5 @@ directory by putting the name of the context to the ``rm_contexts`` file. ::
# ls foo
# ls: cannot access 'foo': No such file or directory
-Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the
-root directory only.
+Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files
+are in the root directory only.
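
To complement the ``kdamonds/0/state`` example earlier in this file, here is a small user-space sketch that toggles a kdamond. The ``/sys/kernel/mm/damon/admin`` prefix is the usual location of the DAMON sysfs root and is assumed here::

  #include <stdio.h>

  /* Write "on" or "off" to the state file of kdamond <idx>. */
  static int kdamond_set_state(int idx, const char *state)
  {
          char path[96];
          FILE *f;

          snprintf(path, sizeof(path),
                   "/sys/kernel/mm/damon/admin/kdamonds/%d/state", idx);
          f = fopen(path, "w");
          if (!f)
                  return -1;
          fputs(state, f);
          return fclose(f);
  }

  int main(void)
  {
          kdamond_set_state(0, "on");     /* start monitoring */
          /* ... record damon:damon_aggregated tracepoint events here ... */
          kdamond_set_state(0, "off");    /* stop monitoring */
          return 0;
  }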
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index eca38fa81e0f..a70f20ce1ffb 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -250,6 +250,15 @@ MPOL_PREFERRED_MANY
can fall back to all existing numa nodes. This is effectively
MPOL_PREFERRED allowed for a mask rather than a single node.
+MPOL_WEIGHTED_INTERLEAVE
+ This mode operates the same as MPOL_INTERLEAVE, except that
+ interleaving behavior is executed based on weights set in
+ /sys/kernel/mm/mempolicy/weighted_interleave/
+
+ Weighted interleave allocates pages on nodes according to a
+ weight. For example if nodes [0,1] are weighted [5,2], 5 pages
+ will be allocated on node0 for every 2 pages allocated on node1.
+
NUMA memory policy supports the following optional mode flags:
MPOL_F_STATIC_NODES
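
A task opts into the new mode with ``set_mempolicy()``. The sketch below is illustrative only: the fallback definition of ``MPOL_WEIGHTED_INTERLEAVE`` is an assumption for systems whose uapi headers predate this change, and nodes 0 and 1 are example nodes. With the 5 and 2 weights from the paragraph above, roughly five pages land on node 0 for every two on node 1::

  #include <stdio.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/mempolicy.h>

  #ifndef MPOL_WEIGHTED_INTERLEAVE
  #define MPOL_WEIGHTED_INTERLEAVE 6      /* assumed value for older headers */
  #endif

  int main(void)
  {
          /* Interleave over nodes 0 and 1 according to their sysfs weights. */
          unsigned long nodemask = (1UL << 0) | (1UL << 1);

          if (syscall(SYS_set_mempolicy, MPOL_WEIGHTED_INTERLEAVE,
                      &nodemask, 8 * sizeof(nodemask)))
                  perror("set_mempolicy");

          /* Subsequent page allocations for this task now follow the weights. */
          return 0;
  }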
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 858c77fe7dc4..a5a6dbe9029f 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -169,7 +169,7 @@ Error reports
A typical KASAN report looks like this::
==================================================================
- BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan]
+ BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test]
Write of size 1 at addr ffff8801f44ec37b by task insmod/2760
CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698
@@ -179,8 +179,8 @@ A typical KASAN report looks like this::
print_address_description+0x73/0x280
kasan_report+0x144/0x187
__asan_report_store1_noabort+0x17/0x20
- kmalloc_oob_right+0xa8/0xbc [test_kasan]
- kmalloc_tests_init+0x16/0x700 [test_kasan]
+ kmalloc_oob_right+0xa8/0xbc [kasan_test]
+ kmalloc_tests_init+0x16/0x700 [kasan_test]
do_one_initcall+0xa5/0x3ae
do_init_module+0x1b6/0x547
load_module+0x75df/0x8070
@@ -200,8 +200,8 @@ A typical KASAN report looks like this::
save_stack+0x43/0xd0
kasan_kmalloc+0xa7/0xd0
kmem_cache_alloc_trace+0xe1/0x1b0
- kmalloc_oob_right+0x56/0xbc [test_kasan]
- kmalloc_tests_init+0x16/0x700 [test_kasan]
+ kmalloc_oob_right+0x56/0xbc [kasan_test]
+ kmalloc_tests_init+0x16/0x700 [kasan_test]
do_one_initcall+0xa5/0x3ae
do_init_module+0x1b6/0x547
load_module+0x75df/0x8070
@@ -510,15 +510,15 @@ When a test passes::
When a test fails due to a failed ``kmalloc``::
- # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163
+ # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245
Expected ptr is not null, but is
- not ok 4 - kmalloc_large_oob_right
+ not ok 5 - kmalloc_large_oob_right
When a test fails due to a missing KASAN report::
- # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974
+ # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709
KASAN failure expected in "kfree_sensitive(ptr)", but none occurred
- not ok 44 - kmalloc_double_kzfree
+ not ok 28 - kmalloc_double_kzfree
At the end the cumulative status of all KASAN tests is printed. On success::
@@ -534,7 +534,7 @@ There are a few ways to run KUnit-compatible KASAN tests.
1. Loadable module
With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable
- module and run by loading ``test_kasan.ko`` with ``insmod`` or ``modprobe``.
+ module and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``.
2. Built-In
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 1bb69524a62e..2bd0c203dcfb 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -31,6 +31,8 @@ DAMON subsystem is configured with three layers including
interfaces for the user space, on top of the core layer.
+.. _damon_design_configurable_operations_set:
+
Configurable Operations Set
---------------------------
@@ -63,6 +65,8 @@ modules that built on top of the core layer using the API, which can be easily
used by the user space end users.
+.. _damon_operations_set:
+
Operations Set Layer
====================
@@ -71,16 +75,26 @@ The monitoring operations are defined in two parts:
1. Identification of the monitoring target address range for the address space.
2. Access check of specific address range in the target space.
-DAMON currently provides the implementations of the operations for the physical
-and virtual address spaces. Below two subsections describe how those work.
+DAMON currently provides below three operation sets. Below two subsections
+describe how those work.
+
+ - vaddr: Monitor virtual address spaces of specific processes
+ - fvaddr: Monitor fixed virtual address ranges
+ - paddr: Monitor the physical address space of the system
+ .. _damon_design_vaddr_target_regions_construction:
+
VMA-based Target Address Range Construction
-------------------------------------------
-This is only for the virtual address space monitoring operations
-implementation. That for the physical address space simply asks users to
-manually set the monitoring target address ranges.
+A mechanism of ``vaddr`` DAMON operations set that automatically initializes
+and updates the monitoring target address regions so that entire memory
+mappings of the target processes can be covered.
+
+This mechanism is only for the ``vaddr`` operations set. In cases of
+``fvaddr`` and ``paddr`` operation sets, users are asked to manually set the
+monitoring target address ranges.
Only small parts in the super-huge virtual address space of the processes are
mapped to the physical memory and accessed. Thus, tracking the unmapped
@@ -294,9 +308,29 @@ not mandated to support all actions of the list. Hence, the availability of
specific DAMOS action depends on what operations set is selected to be used
together.
-Applying an action to a region is considered as changing the region's
-characteristics. Hence, DAMOS resets the age of regions when an action is
-applied to those.
+The list of the supported actions, their meaning, and DAMON operations sets
+that supports each action are as below.
+
+ - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set.
+ - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set.
+ - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
+ Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set.
+ - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set.
+ - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set.
+ - ``lru_prio``: Prioritize the region on its LRU lists.
+ Supported by ``paddr`` operations set.
+ - ``lru_deprio``: Deprioritize the region on its LRU lists.
+ Supported by ``paddr`` operations set.
+ - ``stat``: Do nothing but count the statistics.
+ Supported by all operations sets.
+
+Applying the actions except ``stat`` to a region is considered as changing the
+region's characteristics. Hence, DAMOS resets the age of regions when any such
+actions are applied to those.
.. _damon_design_damos_access_pattern:
diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst
index a84c14e59053..5a306e4de22e 100644
--- a/Documentation/mm/damon/maintainer-profile.rst
+++ b/Documentation/mm/damon/maintainer-profile.rst
@@ -21,8 +21,8 @@ be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the
memory management subsystem maintainer.
Note again the patches for review should be made against the mm-unstable
-tree[1] whenever possible. damon/next is only for preview of others' works in
-progress.
+tree [1]_ whenever possible. damon/next is only for preview of others' works
+in progress.
Submit checklist addendum
-------------------------
@@ -41,8 +41,8 @@ Further doing below and putting the results will be helpful.
Key cycle dates
---------------
-Patches can be sent anytime. Key cycle dates of the mm-unstable[1] and
-mm-stable[3] trees depend on the memory management subsystem maintainer.
+Patches can be sent anytime. Key cycle dates of the mm-unstable [1]_ and
+mm-stable [3]_ trees depend on the memory management subsystem maintainer.
Review cadence
--------------
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
index 17b9949d9b43..da2745464ece 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
@@ -344,7 +344,7 @@ debugfs Interface
:ref:`sysfs interface <sysfs_interface>`.
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
+``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` and
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
@@ -521,15 +521,15 @@ DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
Turning On/Off
--------------
-Setting the files as described above takes no effect unless you explicitly start the monitoring.  You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on``
+Setting the files as described above takes no effect unless you explicitly start the monitoring.  You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on_DEPRECATED``
file.  Writing ``on`` to the file starts the monitoring of the targets with the attributes.  Writing
``off`` to the file stops those targets.  DAMON also stops if every target process is terminated.  The example commands below turn on,
off, and check the status of DAMON::
# cd <debugfs>/damon
- # echo on > monitor_on
- # echo off > monitor_on
- # cat monitor_on
+ # echo on > monitor_on_DEPRECATED
+ # echo off > monitor_on_DEPRECATED
+ # cat monitor_on_DEPRECATED
off
Please note that you cannot write to the above-mentioned debugfs files while the monitoring is turned on.  If you write to the files while DAMON is running, an error will be
@@ -543,11 +543,11 @@ DAMON does requested monitoring with a kernel thread called kdamond.  You can
get the ``pid`` of the thread.  When the monitoring is turned ``off``, reading the file returns no information::
# cd <debugfs>/damon
- # cat monitor_on
+ # cat monitor_on_DEPRECATED
off
# cat kdamond_pid
none
- # echo on > monitor_on
+ # echo on > monitor_on_DEPRECATED
# cat kdamond_pid
18594
@@ -574,7 +574,7 @@ DAMON does requested monitoring with a kernel thread called kdamond.  You can
# ls foo
# ls: cannot access 'foo': No such file or directory
-Note that the ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the root directory only.
+Note that the ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files are in the root directory only.
Tracepoint for Monitoring Results
@@ -583,9 +583,9 @@ DAMON does requested monitoring with a kernel thread called kdamond.  You can
DAMON provides the monitoring results via a tracepoint, ``damon:damon_aggregated``.  While the monitoring is turned on, you can
record the tracepoint events and show the results using tracepoint supporting tools like perf.  For example::
- # echo on > monitor_on
+ # echo on > monitor_on_DEPRECATED
# perf record -e damon:damon_aggregated &
# sleep 5
# kill 9 $(pidof perf)
- # echo off > monitor_on
+ # echo off > monitor_on_DEPRECATED
# perf script
diff --git a/Documentation/translations/zh_CN/dev-tools/kasan.rst b/Documentation/translations/zh_CN/dev-tools/kasan.rst
index 8fdb20c9665b..2b1e8f74904b 100644
--- a/Documentation/translations/zh_CN/dev-tools/kasan.rst
+++ b/Documentation/translations/zh_CN/dev-tools/kasan.rst
@@ -137,7 +137,7 @@ KASAN is affected by the generic ``panic_on_warn`` command line parameter.  When it is enabled
A typical KASAN report looks like this::
==================================================================
- BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan]
+ BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test]
Write of size 1 at addr ffff8801f44ec37b by task insmod/2760
CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698
@@ -147,8 +147,8 @@ KASAN is affected by the generic ``panic_on_warn`` command line parameter.  When it is enabled
print_address_description+0x73/0x280
kasan_report+0x144/0x187
__asan_report_store1_noabort+0x17/0x20
- kmalloc_oob_right+0xa8/0xbc [test_kasan]
- kmalloc_tests_init+0x16/0x700 [test_kasan]
+ kmalloc_oob_right+0xa8/0xbc [kasan_test]
+ kmalloc_tests_init+0x16/0x700 [kasan_test]
do_one_initcall+0xa5/0x3ae
do_init_module+0x1b6/0x547
load_module+0x75df/0x8070
@@ -168,8 +168,8 @@ KASAN is affected by the generic ``panic_on_warn`` command line parameter.  When it is enabled
save_stack+0x43/0xd0
kasan_kmalloc+0xa7/0xd0
kmem_cache_alloc_trace+0xe1/0x1b0
- kmalloc_oob_right+0x56/0xbc [test_kasan]
- kmalloc_tests_init+0x16/0x700 [test_kasan]
+ kmalloc_oob_right+0x56/0xbc [kasan_test]
+ kmalloc_tests_init+0x16/0x700 [kasan_test]
do_one_initcall+0xa5/0x3ae
do_init_module+0x1b6/0x547
load_module+0x75df/0x8070
@@ -421,15 +421,15 @@ KASAN hooks into the vmap infrastructure to lazily clean up unused shadow memory.
When a test fails due to a failed ``kmalloc``::
- # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163
+ # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245
Expected ptr is not null, but is
- not ok 4 - kmalloc_large_oob_right
+ not ok 5 - kmalloc_large_oob_right
When a test fails due to a missing KASAN report::
- # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974
+ # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709
KASAN failure expected in "kfree_sensitive(ptr)", but none occurred
- not ok 44 - kmalloc_double_kzfree
+ not ok 28 - kmalloc_double_kzfree
At the end the cumulative status of all KASAN tests is printed.  On success::
@@ -445,7 +445,7 @@ KASAN hooks into the vmap infrastructure to lazily clean up unused shadow memory.
1. Loadable module
   With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable module and run by loading
-   ``test_kasan.ko`` with ``insmod`` or ``modprobe``.
+   ``kasan_test.ko`` with ``insmod`` or ``modprobe``.
2. Built-In
diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst
index 6dee719a32ea..7464279f9b7d 100644
--- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst
@@ -344,7 +344,7 @@ debugfs Interface
:ref:`sysfs interface <sysfs_interface>`.
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
+``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` and
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
@@ -521,15 +521,15 @@ DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
Turning On/Off
--------------
-Setting the files as described above takes no effect unless you explicitly start the monitoring.  You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on``
+Setting the files as described above takes no effect unless you explicitly start the monitoring.  You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on_DEPRECATED``
file.  Writing ``on`` to the file starts the monitoring of the targets with the attributes.  Writing
``off`` to the file stops those targets.  DAMON also stops if every target process is terminated.  The example commands below turn on,
off, and check the status of DAMON::
# cd <debugfs>/damon
- # echo on > monitor_on
- # echo off > monitor_on
- # cat monitor_on
+ # echo on > monitor_on_DEPRECATED
+ # echo off > monitor_on_DEPRECATED
+ # cat monitor_on_DEPRECATED
off
Please note that you cannot write to the above-mentioned debugfs files while the monitoring is turned on.  If you write to the files while DAMON is running, an error will be
@@ -543,11 +543,11 @@ DAMON does requested monitoring with a kernel thread called kdamond.  You can
get the ``pid`` of the thread.  When the monitoring is turned ``off``, reading the file returns no information::
# cd <debugfs>/damon
- # cat monitor_on
+ # cat monitor_on_DEPRECATED
off
# cat kdamond_pid
none
- # echo on > monitor_on
+ # echo on > monitor_on_DEPRECATED
# cat kdamond_pid
18594
@@ -574,7 +574,7 @@ DAMON does requested monitoring with a kernel thread called kdamond.  You can
# ls foo
# ls: cannot access 'foo': No such file or directory
-Note that the ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the root directory only.
+Note that the ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files are in the root directory only.
Tracepoint for Monitoring Results
@@ -583,10 +583,10 @@ DAMON does requested monitoring with a kernel thread called kdamond.  You can
DAMON provides the monitoring results via a tracepoint, ``damon:damon_aggregated``.  While the monitoring is turned on, you can
record the tracepoint events and show the results using tracepoint supporting tools like perf.  For example::
- # echo on > monitor_on
+ # echo on > monitor_on_DEPRECATED
# perf record -e damon:damon_aggregated &
# sleep 5
# kill 9 $(pidof perf)
- # echo off > monitor_on
+ # echo off > monitor_on_DEPRECATED
# perf script
diff --git a/Documentation/translations/zh_TW/dev-tools/kasan.rst b/Documentation/translations/zh_TW/dev-tools/kasan.rst
index 979eb84bc58f..ed342e67d8ed 100644
--- a/Documentation/translations/zh_TW/dev-tools/kasan.rst
+++ b/Documentation/translations/zh_TW/dev-tools/kasan.rst
@@ -137,7 +137,7 @@ KASAN is affected by the generic ``panic_on_warn`` command line parameter.  When it is enabled
A typical KASAN report looks like this::
==================================================================
- BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan]
+ BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test]
Write of size 1 at addr ffff8801f44ec37b by task insmod/2760
CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698
@@ -147,8 +147,8 @@ KASAN is affected by the generic ``panic_on_warn`` command line parameter.  When it is enabled
print_address_description+0x73/0x280
kasan_report+0x144/0x187
__asan_report_store1_noabort+0x17/0x20
- kmalloc_oob_right+0xa8/0xbc [test_kasan]
- kmalloc_tests_init+0x16/0x700 [test_kasan]
+ kmalloc_oob_right+0xa8/0xbc [kasan_test]
+ kmalloc_tests_init+0x16/0x700 [kasan_test]
do_one_initcall+0xa5/0x3ae
do_init_module+0x1b6/0x547
load_module+0x75df/0x8070
@@ -168,8 +168,8 @@ KASAN is affected by the generic ``panic_on_warn`` command line parameter.  When it is enabled
save_stack+0x43/0xd0
kasan_kmalloc+0xa7/0xd0
kmem_cache_alloc_trace+0xe1/0x1b0
- kmalloc_oob_right+0x56/0xbc [test_kasan]
- kmalloc_tests_init+0x16/0x700 [test_kasan]
+ kmalloc_oob_right+0x56/0xbc [kasan_test]
+ kmalloc_tests_init+0x16/0x700 [kasan_test]
do_one_initcall+0xa5/0x3ae
do_init_module+0x1b6/0x547
load_module+0x75df/0x8070
@@ -421,15 +421,15 @@ KASAN hooks into the vmap infrastructure to lazily clean up unused shadow memory.
When a test fails due to a failed ``kmalloc``::
- # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163
+ # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245
Expected ptr is not null, but is
- not ok 4 - kmalloc_large_oob_right
+ not ok 5 - kmalloc_large_oob_right
When a test fails due to a missing KASAN report::
- # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974
+ # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709
KASAN failure expected in "kfree_sensitive(ptr)", but none occurred
- not ok 44 - kmalloc_double_kzfree
+ not ok 28 - kmalloc_double_kzfree
At the end the cumulative status of all KASAN tests is printed.  On success::
@@ -445,7 +445,7 @@ KASAN hooks into the vmap infrastructure to lazily clean up unused shadow memory.
1. Loadable module
   With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable module and run by loading
-   ``test_kasan.ko`` with ``insmod`` or ``modprobe``.
+   ``kasan_test.ko`` with ``insmod`` or ``modprobe``.
2. Built-In
diff --git a/MAINTAINERS b/MAINTAINERS
index f7c81cea9b69..f3f5981ced29 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -24423,6 +24423,7 @@ F: include/linux/zpool.h
F: include/linux/zswap.h
F: mm/zpool.c
F: mm/zswap.c
+F: tools/testing/selftests/cgroup/test_zswap.c
THE REST
M: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 1b0483c51cc1..7d294a3242a4 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -6,6 +6,7 @@
config ARC
def_bool y
select ARC_TIMERS
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DMA_PREP_COHERENT
diff --git a/arch/arc/include/asm/cachetype.h b/arch/arc/include/asm/cachetype.h
new file mode 100644
index 000000000000..05fc7ed59712
--- /dev/null
+++ b/arch/arc/include/asm/cachetype.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_ARC_CACHETYPE_H
+#define __ASM_ARC_CACHETYPE_H
+
+#include <linux/types.h>
+
+#define cpu_dcache_is_aliasing() true
+
+#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0af6709570d1..66a8e64b226e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -5,6 +5,7 @@ config ARM
select ARCH_32BIT_OFF_T
select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE if HAVE_KRETPROBES && FRAME_POINTER && !ARM_UNWIND
select ARCH_HAS_BINFMT_FLAT
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_CPU_FINALIZE_INIT if MMU
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index 5fbbac1b708b..f1fc278081d0 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -17,7 +17,7 @@ config ARM_PTDUMP_DEBUGFS
kernel.
If in doubt, say "N"
-config DEBUG_WX
+config ARM_DEBUG_WX
bool "Warn on W+X mappings at boot"
depends on MMU
select ARM_PTDUMP_CORE
diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig
index b3dc0465796f..28b724d59e7e 100644
--- a/arch/arm/configs/aspeed_g4_defconfig
+++ b/arch/arm/configs/aspeed_g4_defconfig
@@ -252,7 +252,7 @@ CONFIG_DEBUG_INFO_REDUCED=y
CONFIG_GDB_SCRIPTS=y
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_FS=y
-CONFIG_DEBUG_WX=y
+CONFIG_ARM_DEBUG_WX=y
CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 3fdf4dbfdea5..61cee1e7ebea 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -302,7 +302,7 @@ CONFIG_DEBUG_INFO_REDUCED=y
CONFIG_GDB_SCRIPTS=y
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_FS=y
-CONFIG_DEBUG_WX=y
+CONFIG_ARM_DEBUG_WX=y
CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
diff --git a/arch/arm/include/asm/cachetype.h b/arch/arm/include/asm/cachetype.h
index e8c30430be33..b9dbe1d4c8fe 100644
--- a/arch/arm/include/asm/cachetype.h
+++ b/arch/arm/include/asm/cachetype.h
@@ -20,6 +20,8 @@ extern unsigned int cacheid;
#define icache_is_vipt_aliasing() cacheid_is(CACHEID_VIPT_I_ALIASING)
#define icache_is_pipt() cacheid_is(CACHEID_PIPT)
+#define cpu_dcache_is_aliasing() (cache_is_vivt() || cache_is_vipt_aliasing())
+
/*
* __LINUX_ARM_ARCH__ is the minimum supported CPU architecture
* Mask out support which will never be present on newer CPUs.
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index d657b84b6bf7..be91e376df79 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -209,6 +209,8 @@ static inline void __sync_icache_dcache(pte_t pteval)
extern void __sync_icache_dcache(pte_t pteval);
#endif
+#define PFN_PTE_SHIFT PAGE_SHIFT
+
void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval, unsigned int nr);
#define set_ptes set_ptes
diff --git a/arch/arm/include/asm/ptdump.h b/arch/arm/include/asm/ptdump.h
index aad1d034136c..46a4575146ee 100644
--- a/arch/arm/include/asm/ptdump.h
+++ b/arch/arm/include/asm/ptdump.h
@@ -32,10 +32,10 @@ void ptdump_check_wx(void);
#endif /* CONFIG_ARM_PTDUMP_CORE */
-#ifdef CONFIG_DEBUG_WX
-#define debug_checkwx() ptdump_check_wx()
+#ifdef CONFIG_ARM_DEBUG_WX
+#define arm_debug_checkwx() ptdump_check_wx()
#else
-#define debug_checkwx() do { } while (0)
+#define arm_debug_checkwx() do { } while (0)
#endif
#endif /* __ASM_PTDUMP_H */
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index a42e4cd11db2..4c3d78691279 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -458,7 +458,7 @@ static int __mark_rodata_ro(void *unused)
void mark_rodata_ro(void)
{
stop_machine(__mark_rodata_ro, NULL, NULL);
- debug_checkwx();
+ arm_debug_checkwx();
}
#else
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 674ed71573a8..c24e29c0b9a4 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1814,6 +1814,6 @@ void set_ptes(struct mm_struct *mm, unsigned long addr,
if (--nr == 0)
break;
ptep++;
- pte_val(pteval) += PAGE_SIZE;
+ pteval = pte_next_pfn(pteval);
}
}
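
The change above replaces open-coded ``pte_val(pteval) += PAGE_SIZE`` with ``pte_next_pfn()``, a helper that steps a pte to the next pfn without touching the permission bits. The snippet below is only a kernel-style sketch of that helper's shape (the ``example_`` names are hypothetical), modelled on the arm64 ``pte_advance_pfn()`` added later in this series; the real generic definitions live in ``include/linux/pgtable.h`` and may differ in detail::

  /* Kernel-style sketch (not standalone): advance the pfn, keep other bits. */
  static inline pte_t example_pte_advance_pfn(pte_t pte, unsigned long nr)
  {
          return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte));
  }

  /* Stepping by one page, as used in the set_ptes() loop above. */
  #define example_pte_next_pfn(pte)     example_pte_advance_pfn(pte, 1)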
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index aa7c1d435139..386566138620 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2229,6 +2229,15 @@ config UNWIND_PATCH_PAC_INTO_SCS
select UNWIND_TABLES
select DYNAMIC_SCS
+config ARM64_CONTPTE
+ bool "Contiguous PTE mappings for user memory" if EXPERT
+ depends on TRANSPARENT_HUGEPAGE
+ default y
+ help
+ When enabled, user mappings are configured using the PTE contiguous
+ bit, for any mappings that meet the size and alignment requirements.
+ This reduces TLB pressure and improves performance.
+
endmenu # "Kernel Features"
menu "Boot options"
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 79ce70fbb751..401087e8a43d 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -93,7 +93,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
__pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
#define pte_none(pte) (!pte_val(pte))
-#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0))
+#define __pte_clear(mm, addr, ptep) \
+ __set_pte(ptep, __pte(0))
#define pte_page(pte) (pfn_to_page(pte_pfn(pte)))
/*
@@ -133,11 +134,15 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
#define pte_valid_not_user(pte) \
((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN))
/*
+ * Returns true if the pte is valid and has the contiguous bit set.
+ */
+#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte))
+/*
* Could the pte be present in the TLB? We must check mm_tlb_flush_pending
* so that we don't erroneously return false for pages that have been
* remapped as PROT_NONE but are yet to be flushed from the TLB.
* Note that we can't make any assumptions based on the state of the access
- * flag, since ptep_clear_flush_young() elides a DSB when invalidating the
+ * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the
* TLB.
*/
#define pte_accessible(mm, pte) \
@@ -261,7 +266,7 @@ static inline pte_t pte_mkdevmap(pte_t pte)
return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL));
}
-static inline void set_pte(pte_t *ptep, pte_t pte)
+static inline void __set_pte(pte_t *ptep, pte_t pte)
{
WRITE_ONCE(*ptep, pte);
@@ -275,6 +280,11 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
}
}
+static inline pte_t __ptep_get(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+
extern void __sync_icache_dcache(pte_t pteval);
bool pgattr_change_is_safe(u64 old, u64 new);
@@ -302,7 +312,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
if (!IS_ENABLED(CONFIG_DEBUG_VM))
return;
- old_pte = READ_ONCE(*ptep);
+ old_pte = __ptep_get(ptep);
if (!pte_valid(old_pte) || !pte_valid(pte))
return;
@@ -311,7 +321,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
/*
* Check for potential race with hardware updates of the pte
- * (ptep_set_access_flags safely changes valid ptes without going
+ * (__ptep_set_access_flags safely changes valid ptes without going
* through an invalid entry).
*/
VM_WARN_ONCE(!pte_young(pte),
@@ -341,23 +351,38 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages)
mte_sync_tags(pte, nr_pages);
}
-static inline void set_ptes(struct mm_struct *mm,
- unsigned long __always_unused addr,
- pte_t *ptep, pte_t pte, unsigned int nr)
+/*
+ * Select all bits except the pfn
+ */
+static inline pgprot_t pte_pgprot(pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+}
+
+#define pte_advance_pfn pte_advance_pfn
+static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
+{
+ return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte));
+}
+
+static inline void __set_ptes(struct mm_struct *mm,
+ unsigned long __always_unused addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
{
page_table_check_ptes_set(mm, ptep, pte, nr);
__sync_cache_and_tags(pte, nr);
for (;;) {
__check_safe_pte_update(mm, ptep, pte);
- set_pte(ptep, pte);
+ __set_pte(ptep, pte);
if (--nr == 0)
break;
ptep++;
- pte_val(pte) += PAGE_SIZE;
+ pte = pte_advance_pfn(pte, 1);
}
}
-#define set_ptes set_ptes
/*
* Huge pte definitions.
@@ -433,16 +458,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
}
-/*
- * Select all bits except the pfn
- */
-static inline pgprot_t pte_pgprot(pte_t pte)
-{
- unsigned long pfn = pte_pfn(pte);
-
- return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
-}
-
#ifdef CONFIG_NUMA_BALANCING
/*
* See the comment in include/linux/pgtable.h
@@ -534,7 +549,7 @@ static inline void __set_pte_at(struct mm_struct *mm,
{
__sync_cache_and_tags(pte, nr);
__check_safe_pte_update(mm, ptep, pte);
- set_pte(ptep, pte);
+ __set_pte(ptep, pte);
}
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
@@ -848,8 +863,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
}
-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-extern int ptep_set_access_flags(struct vm_area_struct *vma,
+extern int __ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty);
@@ -859,7 +873,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty)
{
- return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
+ return __ptep_set_access_flags(vma, address, (pte_t *)pmdp,
+ pmd_pte(entry), dirty);
}
static inline int pud_devmap(pud_t pud)
@@ -893,12 +908,13 @@ static inline bool pud_user_accessible_page(pud_t pud)
/*
* Atomic pte/pmd modifications.
*/
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static inline int __ptep_test_and_clear_young(pte_t *ptep)
+static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep)
{
pte_t old_pte, pte;
- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);
do {
old_pte = pte;
pte = pte_mkold(pte);
@@ -909,18 +925,10 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep)
return pte_young(pte);
}
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address,
- pte_t *ptep)
-{
- return __ptep_test_and_clear_young(ptep);
-}
-
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
+static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- int young = ptep_test_and_clear_young(vma, address, ptep);
+ int young = __ptep_test_and_clear_young(vma, address, ptep);
if (young) {
/*
@@ -943,12 +951,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
{
- return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
+ return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
unsigned long address, pte_t *ptep)
{
pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0));
@@ -958,6 +965,37 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
return pte;
}
+static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ for (;;) {
+ __ptep_get_and_clear(mm, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+
+static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full)
+{
+ pte_t pte, tmp_pte;
+
+ pte = __ptep_get_and_clear(mm, addr, ptep);
+ while (--nr) {
+ ptep++;
+ addr += PAGE_SIZE;
+ tmp_pte = __ptep_get_and_clear(mm, addr, ptep);
+ if (pte_dirty(tmp_pte))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tmp_pte))
+ pte = pte_mkyoung(pte);
+ }
+ return pte;
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
@@ -971,16 +1009,12 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/*
- * ptep_set_wrprotect - mark read-only while trasferring potential hardware
- * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
- */
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
+static inline void ___ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep,
+ pte_t pte)
{
- pte_t old_pte, pte;
+ pte_t old_pte;
- pte = READ_ONCE(*ptep);
do {
old_pte = pte;
pte = pte_wrprotect(pte);
@@ -989,12 +1023,31 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
} while (pte_val(pte) != pte_val(old_pte));
}
+/*
+ * __ptep_set_wrprotect - mark read-only while transferring potential hardware
+ * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
+ */
+static inline void __ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
+{
+ ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep));
+}
+
+static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address,
+ pte_t *ptep, unsigned int nr)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
+ __ptep_set_wrprotect(mm, address, ptep);
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long address, pmd_t *pmdp)
{
- ptep_set_wrprotect(mm, address, (pte_t *)pmdp);
+ __ptep_set_wrprotect(mm, address, (pte_t *)pmdp);
}
#define pmdp_establish pmdp_establish
@@ -1072,7 +1125,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
#endif /* CONFIG_ARM64_MTE */
/*
- * On AArch64, the cache coherency is handled via the set_pte_at() function.
+ * On AArch64, the cache coherency is handled via the __set_ptes() function.
*/
static inline void update_mmu_cache_range(struct vm_fault *vmf,
struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
@@ -1124,6 +1177,282 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t old_pte, pte_t new_pte);
+
+#ifdef CONFIG_ARM64_CONTPTE
+
+/*
+ * The contpte APIs are used to transparently manage the contiguous bit in ptes
+ * where it is possible and makes sense to do so. The PTE_CONT bit is considered
+ * a private implementation detail of the public ptep API (see below).
+ */
+extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte);
+extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep);
+extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr);
+extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full);
+extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full);
+extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr);
+extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty);
+
+static __always_inline void contpte_try_fold(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ /*
+ * Only bother trying if both the virtual and physical addresses are
+ * aligned and correspond to the last entry in a contig range. The core
+ * code mostly modifies ranges from low to high, so this is likely the
+ * last modification in the contig range, and therefore a good time to fold.
+ * We can't fold special mappings, because there is no associated folio.
+ */
+
+ const unsigned long contmask = CONT_PTES - 1;
+ bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask;
+
+ if (unlikely(valign)) {
+ bool palign = (pte_pfn(pte) & contmask) == contmask;
+
+ if (unlikely(palign &&
+ pte_valid(pte) && !pte_cont(pte) && !pte_special(pte)))
+ __contpte_try_fold(mm, addr, ptep, pte);
+ }
+}
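A worked example of the alignment test above, assuming a 4K base page size where CONT_PTES is 16 (a 64K contpte block); the address is made up for illustration:

	unsigned long addr = 0xffff00000000f000UL;	/* page index 15 within its 64K block */
	bool valign = ((addr >> PAGE_SHIFT) & (CONT_PTES - 1)) == (CONT_PTES - 1);
	/* valign is true here; the fold is only attempted if the pfn is likewise
	 * the last of an aligned 16-pfn run and the pte is valid, non-cont and
	 * not special. */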
+
+static __always_inline void contpte_try_unfold(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ if (unlikely(pte_valid_cont(pte)))
+ __contpte_try_unfold(mm, addr, ptep, pte);
+}
+
+#define pte_batch_hint pte_batch_hint
+static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
+{
+ if (!pte_valid_cont(pte))
+ return 1;
+
+ return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1));
+}
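The arithmetic above relies on ptes being 8 bytes: shifting the table pointer right by 3 gives its index, masking with CONT_PTES - 1 gives its slot within the contpte block, and the hint is the number of entries left up to the block boundary. A small illustration, assuming CONT_PTES == 16 and some pte_t *ptep in scope:

	unsigned int hint = CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1));
	/* e.g. a ptep in slot 13 of its block yields hint == 3 (slots 13, 14, 15),
	 * so a batched walker can skip straight to the block boundary. */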
+
+/*
+ * The below functions constitute the public API that arm64 presents to the
+ * core-mm to manipulate PTE entries within their page tables (or at least this
+ * is the subset of the API that arm64 needs to implement). These public
+ * versions will automatically and transparently apply the contiguous bit where
+ * it makes sense to do so. Therefore any users that are contig-aware (e.g.
+ * hugetlb, kernel mapper) should NOT use these APIs, but instead use the
+ * private versions, which are prefixed with double underscore. All of these
+ * APIs except for ptep_get_lockless() are expected to be called with the PTL
+ * held. Although the contiguous bit is considered private to the
+ * implementation, it is deliberately allowed to leak through the getters (e.g.
+ * ptep_get()), back to core code. This is required so that pte_leaf_size() can
+ * provide an accurate size for perf_get_pgtable_size(). But this leakage means
+ * its possible a pte will be passed to a setter with the contiguous bit set, so
+ * we explicitly clear the contiguous bit in those cases to prevent accidentally
+ * setting it in the pgtable.
+ */
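A compact sketch of the split described above, using assumed mm/addr/ptep/pte/nr in scope; it is illustrative, not taken from the patch:

	set_ptes(mm, addr, ptep, pte, nr);	/* core-mm path: may fold/unfold PTE_CONT */
	__set_ptes(mm, addr, ptep, pte, nr);	/* contig-aware callers (hugetlb, kernel
						 * mapper): PTE_CONT is never touched */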
+
+#define ptep_get ptep_get
+static inline pte_t ptep_get(pte_t *ptep)
+{
+ pte_t pte = __ptep_get(ptep);
+
+ if (likely(!pte_valid_cont(pte)))
+ return pte;
+
+ return contpte_ptep_get(ptep, pte);
+}
+
+#define ptep_get_lockless ptep_get_lockless
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+ pte_t pte = __ptep_get(ptep);
+
+ if (likely(!pte_valid_cont(pte)))
+ return pte;
+
+ return contpte_ptep_get_lockless(ptep);
+}
+
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ /*
+ * We don't have the mm or vaddr so cannot unfold contig entries (since
+ * it requires tlb maintenance). set_pte() is not used in core code, so
+ * this should never even be called. Regardless do our best to service
+ * any call and emit a warning if there is any attempt to set a pte on
+ * top of an existing contig range.
+ */
+ pte_t orig_pte = __ptep_get(ptep);
+
+ WARN_ON_ONCE(pte_valid_cont(orig_pte));
+ __set_pte(ptep, pte_mknoncont(pte));
+}
+
+#define set_ptes set_ptes
+static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
+{
+ pte = pte_mknoncont(pte);
+
+ if (likely(nr == 1)) {
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __set_ptes(mm, addr, ptep, pte, 1);
+ contpte_try_fold(mm, addr, ptep, pte);
+ } else {
+ contpte_set_ptes(mm, addr, ptep, pte, nr);
+ }
+}
+
+static inline void pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __pte_clear(mm, addr, ptep);
+}
+
+#define clear_full_ptes clear_full_ptes
+static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ if (likely(nr == 1)) {
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __clear_full_ptes(mm, addr, ptep, nr, full);
+ } else {
+ contpte_clear_full_ptes(mm, addr, ptep, nr, full);
+ }
+}
+
+#define get_and_clear_full_ptes get_and_clear_full_ptes
+static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full)
+{
+ pte_t pte;
+
+ if (likely(nr == 1)) {
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
+ } else {
+ pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full);
+ }
+
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ return __ptep_get_and_clear(mm, addr, ptep);
+}
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (likely(!pte_valid_cont(orig_pte)))
+ return __ptep_test_and_clear_young(vma, addr, ptep);
+
+ return contpte_ptep_test_and_clear_young(vma, addr, ptep);
+}
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (likely(!pte_valid_cont(orig_pte)))
+ return __ptep_clear_flush_young(vma, addr, ptep);
+
+ return contpte_ptep_clear_flush_young(vma, addr, ptep);
+}
+
+#define wrprotect_ptes wrprotect_ptes
+static __always_inline void wrprotect_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+ if (likely(nr == 1)) {
+ /*
+ * Optimization: wrprotect_ptes() can only be called for present
+ * ptes so we only need to check contig bit as condition for
+ * unfold, and we can remove the contig bit from the pte we read
+ * to avoid re-reading. This speeds up fork(), which is performance
+ * sensitive for order-0 folios. Equivalent to contpte_try_unfold().
+ */
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (unlikely(pte_cont(orig_pte))) {
+ __contpte_try_unfold(mm, addr, ptep, orig_pte);
+ orig_pte = pte_mknoncont(orig_pte);
+ }
+ ___ptep_set_wrprotect(mm, addr, ptep, orig_pte);
+ } else {
+ contpte_wrprotect_ptes(mm, addr, ptep, nr);
+ }
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ wrprotect_ptes(mm, addr, ptep, 1);
+}
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+static inline int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ entry = pte_mknoncont(entry);
+
+ if (likely(!pte_valid_cont(orig_pte)))
+ return __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+
+ return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+}
+
+#else /* CONFIG_ARM64_CONTPTE */
+
+#define ptep_get __ptep_get
+#define set_pte __set_pte
+#define set_ptes __set_ptes
+#define pte_clear __pte_clear
+#define clear_full_ptes __clear_full_ptes
+#define get_and_clear_full_ptes __get_and_clear_full_ptes
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define ptep_get_and_clear __ptep_get_and_clear
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+#define ptep_test_and_clear_young __ptep_test_and_clear_young
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young __ptep_clear_flush_young
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+#define ptep_set_wrprotect __ptep_set_wrprotect
+#define wrprotect_ptes __wrprotect_ptes
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+#define ptep_set_access_flags __ptep_set_access_flags
+
+#endif /* CONFIG_ARM64_CONTPTE */
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
index 581caac525b0..5b1701c76d1c 100644
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -29,13 +29,6 @@ void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
static inline void ptdump_debugfs_register(struct ptdump_info *info,
const char *name) { }
#endif
-void ptdump_check_wx(void);
#endif /* CONFIG_PTDUMP_CORE */
-#ifdef CONFIG_DEBUG_WX
-#define debug_checkwx() ptdump_check_wx()
-#else
-#define debug_checkwx() do { } while (0)
-#endif
-
#endif /* __ASM_PTDUMP_H */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 1deb5d789c2e..3b0e8248e1a4 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -422,7 +422,7 @@ do { \
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
-static inline void __flush_tlb_range(struct vm_area_struct *vma,
+static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long stride, bool last_level,
int tlb_level)
@@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
__flush_tlb_range_op(vae1is, start, pages, stride, asid,
tlb_level, true, lpa2_is_enabled());
- dsb(ish);
mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
}
+static inline void __flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long stride, bool last_level,
+ int tlb_level)
+{
+ __flush_tlb_range_nosync(vma, start, end, stride,
+ last_level, tlb_level);
+ dsb(ish);
+}
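The _nosync split lets a caller decide when (or whether) the completion barrier is issued; a sketch of the intended pattern, with assumed arguments:

	__flush_tlb_range_nosync(vma, start, end, PAGE_SIZE, true, 3);
	/* ... the caller may batch further invalidations, or elide the trailing
	 * dsb(ish) entirely, as contpte_ptep_clear_flush_young() below does for
	 * access-flag clearing. */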
+
static inline void flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 0228001347be..9afcc690fe73 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data)
{
struct set_perm_data *spd = data;
const efi_memory_desc_t *md = spd->md;
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = __ptep_get(ptep);
if (md->attribute & EFI_MEMORY_RO)
pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
@@ -111,7 +111,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data)
pte = set_pte_bit(pte, __pgprot(PTE_PXN));
else if (system_supports_bti_kernel() && spd->has_bti)
pte = set_pte_bit(pte, __pgprot(PTE_GP));
- set_pte(ptep, pte);
+ __set_pte(ptep, pte);
return 0;
}
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index a41ef3213e1e..dcdcccd40891 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2)
/*
* If the page content is identical but at least one of the pages is
* tagged, return non-zero to avoid KSM merging. If only one of the
- * pages is tagged, set_pte_at() may zero or change the tags of the
+ * pages is tagged, __set_ptes() may zero or change the tags of the
* other page via mte_sync_tags().
*/
if (page_mte_tagged(page1) || page_mte_tagged(page2))
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index aaf1d4939739..629145fd3161 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -1072,7 +1072,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
} else {
/*
* Only locking to serialise with a concurrent
- * set_pte_at() in the VMM but still overriding the
+ * __set_ptes() in the VMM but still overriding the
* tags, hence ignoring the return value.
*/
try_page_mte_tagging(page);
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index dbd1bc95967d..60454256945b 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
cache.o copypage.o flush.o \
ioremap.o mmap.o pgd.o mmu.o \
context.o proc.o pageattr.o fixmap.o
+obj-$(CONFIG_ARM64_CONTPTE) += contpte.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
new file mode 100644
index 000000000000..16788f07716d
--- /dev/null
+++ b/arch/arm64/mm/contpte.c
@@ -0,0 +1,404 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/export.h>
+#include <asm/tlbflush.h>
+
+static inline bool mm_is_user(struct mm_struct *mm)
+{
+ /*
+ * Don't attempt to apply the contig bit to kernel mappings, because
+ * dynamically adding/removing the contig bit can cause page faults.
+ * These racing faults are ok for user space, since they get serialized
+ * on the PTL. But kernel mappings can't tolerate faults.
+ */
+ if (unlikely(mm_is_efi(mm)))
+ return false;
+ return mm != &init_mm;
+}
+
+static inline pte_t *contpte_align_down(pte_t *ptep)
+{
+ return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
+}
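A quick illustration of the pointer alignment, assuming 8-byte ptes and CONT_PTES == 16, so a contpte block spans 128 bytes of page table:

	pte_t *first = contpte_align_down(ptep);	/* e.g. ...0x68 aligns down to ...0x00 */
	/* first now points at slot 0 of the contpte block containing ptep. */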
+
+static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ /*
+ * Unfold any partially covered contpte block at the beginning and end
+ * of the range.
+ */
+
+ if (ptep != contpte_align_down(ptep) || nr < CONT_PTES)
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+
+ if (ptep + nr != contpte_align_down(ptep + nr)) {
+ unsigned long last_addr = addr + PAGE_SIZE * (nr - 1);
+ pte_t *last_ptep = ptep + nr - 1;
+
+ contpte_try_unfold(mm, last_addr, last_ptep,
+ __ptep_get(last_ptep));
+ }
+}
+
+static void contpte_convert(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ unsigned long start_addr;
+ pte_t *start_ptep;
+ int i;
+
+ start_ptep = ptep = contpte_align_down(ptep);
+ start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+ pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte));
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) {
+ pte_t ptent = __ptep_get_and_clear(mm, addr, ptep);
+
+ if (pte_dirty(ptent))
+ pte = pte_mkdirty(pte);
+
+ if (pte_young(ptent))
+ pte = pte_mkyoung(pte);
+ }
+
+ __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+
+ __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
+}
+
+void __contpte_try_fold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ /*
+ * We have already checked that the virtual and physical addresses are
+ * correctly aligned for a contpte mapping in contpte_try_fold() so the
+ * remaining checks are to ensure that the contpte range is fully
+ * covered by a single folio, and ensure that all the ptes are valid
+ * with contiguous PFNs and matching prots. We ignore the state of the
+ * access and dirty bits for the purpose of deciding if it's a contiguous
+ * range; the folding process will generate a single contpte entry which
+ * has a single access and dirty bit. Those 2 bits are the logical OR of
+ * their respective bits in the constituent pte entries. In order to
+ * ensure the contpte range is covered by a single folio, we must
+ * recover the folio from the pfn, but special mappings don't have a
+ * folio backing them. Fortunately contpte_try_fold() already checked
+ * that the pte is not special - we never try to fold special mappings.
+ * Note we can't use vm_normal_page() for this since we don't have the
+ * vma.
+ */
+
+ unsigned long folio_start, folio_end;
+ unsigned long cont_start, cont_end;
+ pte_t expected_pte, subpte;
+ struct folio *folio;
+ struct page *page;
+ unsigned long pfn;
+ pte_t *orig_ptep;
+ pgprot_t prot;
+
+ int i;
+
+ if (!mm_is_user(mm))
+ return;
+
+ page = pte_page(pte);
+ folio = page_folio(page);
+ folio_start = addr - (page - &folio->page) * PAGE_SIZE;
+ folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE;
+ cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+ cont_end = cont_start + CONT_PTE_SIZE;
+
+ if (folio_start > cont_start || folio_end < cont_end)
+ return;
+
+ pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES);
+ prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
+ expected_pte = pfn_pte(pfn, prot);
+ orig_ptep = ptep;
+ ptep = contpte_align_down(ptep);
+
+ for (i = 0; i < CONT_PTES; i++) {
+ subpte = pte_mkold(pte_mkclean(__ptep_get(ptep)));
+ if (!pte_same(subpte, expected_pte))
+ return;
+ expected_pte = pte_advance_pfn(expected_pte, 1);
+ ptep++;
+ }
+
+ pte = pte_mkcont(pte);
+ contpte_convert(mm, addr, orig_ptep, pte);
+}
+EXPORT_SYMBOL(__contpte_try_fold);
+
+void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ /*
+ * We have already checked that the ptes are contiguous in
+ * contpte_try_unfold(), so just check that the mm is user space.
+ */
+ if (!mm_is_user(mm))
+ return;
+
+ pte = pte_mknoncont(pte);
+ contpte_convert(mm, addr, ptep, pte);
+}
+EXPORT_SYMBOL(__contpte_try_unfold);
+
+pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
+{
+ /*
+ * Gather access/dirty bits, which may be populated in any of the ptes
+ * of the contig range. We are guaranteed to be holding the PTL, so any
+ * contiguous range cannot be unfolded or otherwise modified under our
+ * feet.
+ */
+
+ pte_t pte;
+ int i;
+
+ ptep = contpte_align_down(ptep);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++) {
+ pte = __ptep_get(ptep);
+
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+
+ return orig_pte;
+}
+EXPORT_SYMBOL(contpte_ptep_get);
+
+pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
+{
+ /*
+ * Gather access/dirty bits, which may be populated in any of the ptes
+ * of the contig range. We may not be holding the PTL, so any contiguous
+ * range may be unfolded/modified/refolded under our feet. Therefore we
+ * ensure we read a _consistent_ contpte range by checking that all ptes
+ * in the range are valid and have CONT_PTE set, that all pfns are
+ * contiguous and that all pgprots are the same (ignoring access/dirty).
+ * If we find a pte that is not consistent, then we must be racing with
+ * an update so start again. If the target pte does not have CONT_PTE
+ * set then that is considered consistent on its own because it is not
+ * part of a contpte range.
+ */
+
+ pgprot_t orig_prot;
+ unsigned long pfn;
+ pte_t orig_pte;
+ pgprot_t prot;
+ pte_t *ptep;
+ pte_t pte;
+ int i;
+
+retry:
+ orig_pte = __ptep_get(orig_ptep);
+
+ if (!pte_valid_cont(orig_pte))
+ return orig_pte;
+
+ orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte)));
+ ptep = contpte_align_down(orig_ptep);
+ pfn = pte_pfn(orig_pte) - (orig_ptep - ptep);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
+ pte = __ptep_get(ptep);
+ prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
+
+ if (!pte_valid_cont(pte) ||
+ pte_pfn(pte) != pfn ||
+ pgprot_val(prot) != pgprot_val(orig_prot))
+ goto retry;
+
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+
+ return orig_pte;
+}
+EXPORT_SYMBOL(contpte_ptep_get_lockless);
+
+void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
+{
+ unsigned long next;
+ unsigned long end;
+ unsigned long pfn;
+ pgprot_t prot;
+
+ /*
+ * The set_ptes() spec guarantees that when nr > 1, the initial state of
+ * all ptes is not-present. Therefore we never need to unfold or
+ * otherwise invalidate a range before we set the new ptes.
+ * contpte_set_ptes() should never be called for nr < 2.
+ */
+ VM_WARN_ON(nr == 1);
+
+ if (!mm_is_user(mm))
+ return __set_ptes(mm, addr, ptep, pte, nr);
+
+ end = addr + (nr << PAGE_SHIFT);
+ pfn = pte_pfn(pte);
+ prot = pte_pgprot(pte);
+
+ do {
+ next = pte_cont_addr_end(addr, end);
+ nr = (next - addr) >> PAGE_SHIFT;
+ pte = pfn_pte(pfn, prot);
+
+ if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0)
+ pte = pte_mkcont(pte);
+ else
+ pte = pte_mknoncont(pte);
+
+ __set_ptes(mm, addr, ptep, pte, nr);
+
+ addr = next;
+ ptep += nr;
+ pfn += nr;
+
+ } while (addr != end);
+}
+EXPORT_SYMBOL(contpte_set_ptes);
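A worked example of the chunking loop above, assuming CONT_PTES == 16 with 4K pages (CONT_PTE_SIZE == 64K); the call shape is illustrative:

	contpte_set_ptes(mm, addr, ptep, pte, 32);
	/* With addr and pte_pfn(pte) both 64K-aligned this writes two fully
	 * covered blocks, each via __set_ptes() with PTE_CONT set. If addr
	 * started mid-block, the leading partial chunk would be written
	 * without PTE_CONT instead. */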
+
+void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ contpte_try_unfold_partial(mm, addr, ptep, nr);
+ __clear_full_ptes(mm, addr, ptep, nr, full);
+}
+EXPORT_SYMBOL(contpte_clear_full_ptes);
+
+pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full)
+{
+ contpte_try_unfold_partial(mm, addr, ptep, nr);
+ return __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
+}
+EXPORT_SYMBOL(contpte_get_and_clear_full_ptes);
+
+int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ /*
+ * ptep_clear_flush_young() technically requires us to clear the access
+ * flag for a _single_ pte. However, the core-mm code actually tracks
+ * access/dirty per folio, not per page. And since we only create a
+ * contig range when the range is covered by a single folio, we can get
+ * away with clearing young for the whole contig range here, so we avoid
+ * having to unfold.
+ */
+
+ int young = 0;
+ int i;
+
+ ptep = contpte_align_down(ptep);
+ addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
+ young |= __ptep_test_and_clear_young(vma, addr, ptep);
+
+ return young;
+}
+EXPORT_SYMBOL(contpte_ptep_test_and_clear_young);
+
+int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int young;
+
+ young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
+
+ if (young) {
+ /*
+ * See comment in __ptep_clear_flush_young(); same rationale for
+ * eliding the trailing DSB applies here.
+ */
+ addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+ __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
+ PAGE_SIZE, true, 3);
+ }
+
+ return young;
+}
+EXPORT_SYMBOL(contpte_ptep_clear_flush_young);
+
+void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ /*
+ * If wrprotecting an entire contig range, we can avoid unfolding. Just
+ * set wrprotect and wait for the later mmu_gather flush to invalidate
+ * the tlb. Until the flush, the page may or may not be wrprotected.
+ * After the flush, it is guaranteed wrprotected. If it's a partial
+ * range though, we must unfold, because we can't have a case where
+ * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this
+ * would cause it to continue to be unpredictable after the flush.
+ */
+
+ contpte_try_unfold_partial(mm, addr, ptep, nr);
+ __wrprotect_ptes(mm, addr, ptep, nr);
+}
+EXPORT_SYMBOL(contpte_wrprotect_ptes);
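For example, assuming CONT_PTES == 16 and a block-aligned addr/ptep:

	contpte_wrprotect_ptes(mm, addr, ptep, CONT_PTES);	/* whole block: stays folded */
	contpte_wrprotect_ptes(mm, addr + 8 * PAGE_SIZE, ptep + 8, CONT_PTES);
	/* straddles a block boundary: both partially covered blocks are
	 * unfolded first, then all 16 ptes are wrprotected. */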
+
+int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ unsigned long start_addr;
+ pte_t orig_pte;
+ int i;
+
+ /*
+ * Gather the access/dirty bits for the contiguous range. If nothing has
+ * changed, it's a no-op.
+ */
+ orig_pte = pte_mknoncont(ptep_get(ptep));
+ if (pte_val(orig_pte) == pte_val(entry))
+ return 0;
+
+ /*
+ * We can fix up access/dirty bits without having to unfold the contig
+ * range. But if the write bit is changing, we must unfold.
+ */
+ if (pte_write(orig_pte) == pte_write(entry)) {
+ /*
+ * For HW access management, we technically only need to update
+ * the flag on a single pte in the range. But for SW access
+ * management, we need to update all the ptes to prevent extra
+ * faults. Avoid per-page tlb flush in __ptep_set_access_flags()
+ * and instead flush the whole range at the end.
+ */
+ ptep = contpte_align_down(ptep);
+ start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
+ __ptep_set_access_flags(vma, addr, ptep, entry, 0);
+
+ if (dirty)
+ __flush_tlb_range(vma, start_addr, addr,
+ PAGE_SIZE, true, 3);
+ } else {
+ __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
+ __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(contpte_ptep_set_access_flags);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 55f6455a8284..9a1c66183d16 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -191,7 +191,7 @@ static void show_pte(unsigned long addr)
if (!ptep)
break;
- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);
pr_cont(", pte=%016llx", pte_val(pte));
pte_unmap(ptep);
} while(0);
@@ -205,16 +205,16 @@ static void show_pte(unsigned long addr)
*
* It needs to cope with hardware update of the accessed/dirty state by other
* agents in the system and can safely skip the __sync_icache_dcache() call as,
- * like set_pte_at(), the PTE is never changed from no-exec to exec here.
+ * like __set_ptes(), the PTE is never changed from no-exec to exec here.
*
* Returns whether or not the PTE actually changed.
*/
-int ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
- pte_t entry, int dirty)
+int __ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
{
pteval_t old_pteval, pteval;
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = __ptep_get(ptep);
if (pte_same(pte, entry))
return 0;
diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c
index c0a3301203bd..bfc02568805a 100644
--- a/arch/arm64/mm/fixmap.c
+++ b/arch/arm64/mm/fixmap.c
@@ -121,9 +121,9 @@ void __set_fixmap(enum fixed_addresses idx,
ptep = fixmap_pte(addr);
if (pgprot_val(flags)) {
- set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
+ __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
} else {
- pte_clear(&init_mm, addr, ptep);
+ __pte_clear(&init_mm, addr, ptep);
flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
}
}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 8116ac599f80..0f0e10bb0a95 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -45,13 +45,6 @@ void __init arm64_hugetlb_cma_reserve(void)
else
order = CONT_PMD_SHIFT - PAGE_SHIFT;
- /*
- * HugeTLB CMA reservation is required for gigantic
- * huge pages which could not be allocated via the
- * page allocator. Just warn if there is any change
- * breaking this assumption.
- */
- WARN_ON(order <= MAX_PAGE_ORDER);
hugetlb_cma_reserve(order);
}
#endif /* CONFIG_CMA */
@@ -152,14 +145,14 @@ pte_t huge_ptep_get(pte_t *ptep)
{
int ncontig, i;
size_t pgsize;
- pte_t orig_pte = ptep_get(ptep);
+ pte_t orig_pte = __ptep_get(ptep);
if (!pte_present(orig_pte) || !pte_cont(orig_pte))
return orig_pte;
ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
for (i = 0; i < ncontig; i++, ptep++) {
- pte_t pte = ptep_get(ptep);
+ pte_t pte = __ptep_get(ptep);
if (pte_dirty(pte))
orig_pte = pte_mkdirty(orig_pte);
@@ -184,11 +177,11 @@ static pte_t get_clear_contig(struct mm_struct *mm,
unsigned long pgsize,
unsigned long ncontig)
{
- pte_t orig_pte = ptep_get(ptep);
+ pte_t orig_pte = __ptep_get(ptep);
unsigned long i;
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
- pte_t pte = ptep_get_and_clear(mm, addr, ptep);
+ pte_t pte = __ptep_get_and_clear(mm, addr, ptep);
/*
* If HW_AFDBM is enabled, then the HW could turn on
@@ -236,7 +229,7 @@ static void clear_flush(struct mm_struct *mm,
unsigned long i, saddr = addr;
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- ptep_clear(mm, addr, ptep);
+ __ptep_get_and_clear(mm, addr, ptep);
flush_tlb_range(&vma, saddr, addr);
}
@@ -254,12 +247,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
if (!pte_present(pte)) {
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
- set_pte_at(mm, addr, ptep, pte);
+ __set_ptes(mm, addr, ptep, pte, 1);
return;
}
if (!pte_cont(pte)) {
- set_pte_at(mm, addr, ptep, pte);
+ __set_ptes(mm, addr, ptep, pte, 1);
return;
}
@@ -270,7 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
clear_flush(mm, addr, ptep, pgsize, ncontig);
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
+ __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
}
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -400,7 +393,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
ncontig = num_contig_ptes(sz, &pgsize);
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- pte_clear(mm, addr, ptep);
+ __pte_clear(mm, addr, ptep);
}
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
@@ -408,10 +401,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
{
int ncontig;
size_t pgsize;
- pte_t orig_pte = ptep_get(ptep);
+ pte_t orig_pte = __ptep_get(ptep);
if (!pte_cont(orig_pte))
- return ptep_get_and_clear(mm, addr, ptep);
+ return __ptep_get_and_clear(mm, addr, ptep);
ncontig = find_num_contig(mm, addr, ptep, &pgsize);
@@ -431,11 +424,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
{
int i;
- if (pte_write(pte) != pte_write(ptep_get(ptep)))
+ if (pte_write(pte) != pte_write(__ptep_get(ptep)))
return 1;
for (i = 0; i < ncontig; i++) {
- pte_t orig_pte = ptep_get(ptep + i);
+ pte_t orig_pte = __ptep_get(ptep + i);
if (pte_dirty(pte) != pte_dirty(orig_pte))
return 1;
@@ -459,7 +452,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
pte_t orig_pte;
if (!pte_cont(pte))
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+ return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
ncontig = find_num_contig(mm, addr, ptep, &pgsize);
dpfn = pgsize >> PAGE_SHIFT;
@@ -478,7 +471,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
hugeprot = pte_pgprot(pte);
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
+ __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
return 1;
}
@@ -492,8 +485,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
size_t pgsize;
pte_t pte;
- if (!pte_cont(READ_ONCE(*ptep))) {
- ptep_set_wrprotect(mm, addr, ptep);
+ if (!pte_cont(__ptep_get(ptep))) {
+ __ptep_set_wrprotect(mm, addr, ptep);
return;
}
@@ -507,7 +500,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
pfn = pte_pfn(pte);
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
+ __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
}
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
@@ -517,7 +510,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
size_t pgsize;
int ncontig;
- if (!pte_cont(READ_ONCE(*ptep)))
+ if (!pte_cont(__ptep_get(ptep)))
return ptep_clear_flush(vma, addr, ptep);
ncontig = find_num_contig(mm, addr, ptep, &pgsize);
@@ -550,7 +543,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr
* when the permission changes from executable to non-executable
* in cases where cpu is affected with errata #2645198.
*/
- if (pte_user_exec(READ_ONCE(*ptep)))
+ if (pte_user_exec(__ptep_get(ptep)))
return huge_ptep_clear_flush(vma, addr, ptep);
}
return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 4c7ad574b946..9ee16cfce587 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -112,8 +112,8 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
if (!early)
memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE);
next = addr + PAGE_SIZE;
- set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
- } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep)));
+ __set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
+ } while (ptep++, addr = next, addr != end && pte_none(__ptep_get(ptep)));
}
static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr,
@@ -271,7 +271,7 @@ static void __init kasan_init_shadow(void)
* so we should make sure that it maps the zero page read-only.
*/
for (i = 0; i < PTRS_PER_PTE; i++)
- set_pte(&kasan_early_shadow_pte[i],
+ __set_pte(&kasan_early_shadow_pte[i],
pfn_pte(sym_to_pfn(kasan_early_shadow_page),
PAGE_KERNEL_RO));
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1ac7467d34c9..6208c7541f87 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -173,16 +173,16 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
ptep = pte_set_fixmap_offset(pmdp, addr);
do {
- pte_t old_pte = READ_ONCE(*ptep);
+ pte_t old_pte = __ptep_get(ptep);
- set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
+ __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
/*
* After the PTE entry has been populated once, we
* only allow updates to the permission attributes.
*/
BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
- READ_ONCE(pte_val(*ptep))));
+ pte_val(__ptep_get(ptep))));
phys += PAGE_SIZE;
} while (ptep++, addr += PAGE_SIZE, addr != end);
@@ -632,8 +632,6 @@ void mark_rodata_ro(void)
section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
-
- debug_checkwx();
}
static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
@@ -854,12 +852,12 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
do {
ptep = pte_offset_kernel(pmdp, addr);
- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);
if (pte_none(pte))
continue;
WARN_ON(!pte_present(pte));
- pte_clear(&init_mm, addr, ptep);
+ __pte_clear(&init_mm, addr, ptep);
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
if (free_mapped)
free_hotplug_page_range(pte_page(pte),
@@ -987,7 +985,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
do {
ptep = pte_offset_kernel(pmdp, addr);
- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);
/*
* This is just a sanity check here which verifies that
@@ -1006,7 +1004,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
*/
ptep = pte_offset_kernel(pmdp, 0UL);
for (i = 0; i < PTRS_PER_PTE; i++) {
- if (!pte_none(READ_ONCE(ptep[i])))
+ if (!pte_none(__ptep_get(&ptep[i])))
return;
}
@@ -1475,7 +1473,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte
* when the permission changes from executable to non-executable
* in cases where cpu is affected with errata #2645198.
*/
- if (pte_user_exec(READ_ONCE(*ptep)))
+ if (pte_user_exec(ptep_get(ptep)))
return ptep_clear_flush(vma, addr, ptep);
}
return ptep_get_and_clear(vma->vm_mm, addr, ptep);
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 924843f1f661..0c4e3ecf989d 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -36,12 +36,12 @@ bool can_set_direct_map(void)
static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
{
struct page_change_data *cdata = data;
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = __ptep_get(ptep);
pte = clear_pte_bit(pte, cdata->clear_mask);
pte = set_pte_bit(pte, cdata->set_mask);
- set_pte(ptep, pte);
+ __set_pte(ptep, pte);
return 0;
}
@@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page)
return true;
ptep = pte_offset_kernel(pmdp, addr);
- return pte_valid(READ_ONCE(*ptep));
+ return pte_valid(__ptep_get(ptep));
}
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index e305b6593c4e..696822f75582 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -345,7 +345,7 @@ static struct ptdump_info kernel_ptdump_info = {
.base_addr = PAGE_OFFSET,
};
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
{
struct pg_state st = {
.seq = NULL,
@@ -366,11 +366,16 @@ void ptdump_check_wx(void)
ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
- if (st.wx_pages || st.uxn_pages)
+ if (st.wx_pages || st.uxn_pages) {
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",
st.wx_pages, st.uxn_pages);
- else
+
+ return false;
+ } else {
pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+
+ return true;
+ }
}
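The arm64 checker now reports its result instead of only logging it; a hypothetical caller (the real generic consumer lives outside this hunk) could act on the return value:

	if (!ptdump_check_wx())
		pr_warn("W+X mappings detected\n");	/* hypothetical consumer */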
static int __init ptdump_init(void)
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
index 7b14df3c6477..5139a28130c0 100644
--- a/arch/arm64/mm/trans_pgd.c
+++ b/arch/arm64/mm/trans_pgd.c
@@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info)
static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
{
- pte_t pte = READ_ONCE(*src_ptep);
+ pte_t pte = __ptep_get(src_ptep);
if (pte_valid(pte)) {
/*
@@ -41,7 +41,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
* read only (code, rodata). Clear the RDONLY bit from
* the temporary mappings we use during restore.
*/
- set_pte(dst_ptep, pte_mkwrite_novma(pte));
+ __set_pte(dst_ptep, pte_mkwrite_novma(pte));
} else if ((debug_pagealloc_enabled() ||
is_kfence_address((void *)addr)) && !pte_none(pte)) {
/*
@@ -55,7 +55,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
*/
BUG_ON(!pfn_valid(pte_pfn(pte)));
- set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte)));
+ __set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte)));
}
}
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index cf2a6fd7dff8..8a91eccf76dc 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -2,6 +2,7 @@
config CSKY
def_bool y
select ARCH_32BIT_OFF_T
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_SYNC_DMA_FOR_CPU
diff --git a/arch/csky/include/asm/cachetype.h b/arch/csky/include/asm/cachetype.h
new file mode 100644
index 000000000000..98cbe3af662f
--- /dev/null
+++ b/arch/csky/include/asm/cachetype.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_CSKY_CACHETYPE_H
+#define __ASM_CSKY_CACHETYPE_H
+
+#include <linux/types.h>
+
+#define cpu_dcache_is_aliasing() true
+
+#endif
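These new per-arch cachetype.h headers all answer the same question; a hypothetical consumer (the real caller is added elsewhere in the series) might use it like this:

	if (cpu_dcache_is_aliasing())
		return -EOPNOTSUPP;	/* refuse features that need a non-aliasing dcache */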
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 4b3e93cac723..a9c3e3de0c6d 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -3,6 +3,7 @@ config M68K
bool
default y
select ARCH_32BIT_OFF_T
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_BINFMT_FLAT
select ARCH_HAS_CPU_FINALIZE_INIT if MMU
select ARCH_HAS_CURRENT_STACK_POINTER
diff --git a/arch/m68k/include/asm/cachetype.h b/arch/m68k/include/asm/cachetype.h
new file mode 100644
index 000000000000..7fad5d9ab8fe
--- /dev/null
+++ b/arch/m68k/include/asm/cachetype.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_M68K_CACHETYPE_H
+#define __ASM_M68K_CACHETYPE_H
+
+#include <linux/types.h>
+
+#define cpu_dcache_is_aliasing() true
+
+#endif
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 797ae590ebdb..ab1c8bd96666 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -4,6 +4,7 @@ config MIPS
default y
select ARCH_32BIT_OFF_T if !64BIT
select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_CURRENT_STACK_POINTER if !CC_IS_CLANG || CLANG_VERSION >= 140000
select ARCH_HAS_DEBUG_VIRTUAL if !64BIT
diff --git a/arch/mips/include/asm/cachetype.h b/arch/mips/include/asm/cachetype.h
new file mode 100644
index 000000000000..9f4ba2fe1155
--- /dev/null
+++ b/arch/mips/include/asm/cachetype.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_MIPS_CACHETYPE_H
+#define __ASM_MIPS_CACHETYPE_H
+
+#include <asm/cpu-features.h>
+
+#define cpu_dcache_is_aliasing() cpu_has_dc_aliases
+
+#endif
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 58d9565dc2c7..6b3a14633d2f 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -2,6 +2,7 @@
config NIOS2
def_bool y
select ARCH_32BIT_OFF_T
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_SYNC_DMA_FOR_CPU
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
diff --git a/arch/nios2/include/asm/cachetype.h b/arch/nios2/include/asm/cachetype.h
new file mode 100644
index 000000000000..eb9c416b8a1c
--- /dev/null
+++ b/arch/nios2/include/asm/cachetype.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_NIOS2_CACHETYPE_H
+#define __ASM_NIOS2_CACHETYPE_H
+
+#include <asm/page.h>
+#include <asm/cache.h>
+
+#define cpu_dcache_is_aliasing() (NIOS2_DCACHE_SIZE > PAGE_SIZE)
+
+#endif
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 5144506dfa69..d052dfcbe8d3 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -178,6 +178,8 @@ static inline void set_pte(pte_t *ptep, pte_t pteval)
*ptep = pteval;
}
+#define PFN_PTE_SHIFT 0
+
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr)
{
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 5c845e8d59d9..da6e97ba46a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -8,6 +8,7 @@ config PARISC
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_SYSCALL_TRACEPOINTS
select ARCH_WANT_FRAME_POINTERS
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_DMA_ALLOC if PA11
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/parisc/include/asm/cachetype.h b/arch/parisc/include/asm/cachetype.h
new file mode 100644
index 000000000000..e0868a1d3c47
--- /dev/null
+++ b/arch/parisc/include/asm/cachetype.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_PARISC_CACHETYPE_H
+#define __ASM_PARISC_CACHETYPE_H
+
+#include <linux/types.h>
+
+#define cpu_dcache_is_aliasing() true
+
+#endif
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9224f23065ff..7a1ba8889aea 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -41,6 +41,8 @@ struct mm_struct;
#ifndef __ASSEMBLY__
+#define PFN_PTE_SHIFT PTE_RPN_SHIFT
+
void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte, unsigned int nr);
#define set_ptes set_ptes
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index b3de6102a907..1ca7d4c4b90d 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -19,6 +19,8 @@
#include <linux/pagemap.h>
+static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
+ unsigned long address);
#define __tlb_remove_tlb_entry __tlb_remove_tlb_entry
#define tlb_flush tlb_flush
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0a540b37aab6..16557d008eef 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -614,8 +614,6 @@ void __init gigantic_hugetlb_cma_reserve(void)
*/
order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
- if (order) {
- VM_WARN_ON(order <= MAX_PAGE_ORDER);
+ if (order)
hugetlb_cma_reserve(order);
- }
}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 72341b9fb552..90dcc2844056 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -171,12 +171,6 @@ static inline void mmu_mark_rodata_ro(void) { }
void __init mmu_mapin_immr(void);
#endif
-#ifdef CONFIG_DEBUG_WX
-void ptdump_check_wx(void);
-#else
-static inline void ptdump_check_wx(void) { }
-#endif
-
static inline bool debug_pagealloc_enabled_or_kfence(void)
{
return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled();
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index a04ae4449a02..549a440ed7f6 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -220,10 +220,7 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
break;
ptep++;
addr += PAGE_SIZE;
- /*
- * increment the pfn.
- */
- pte = pfn_pte(pte_pfn(pte) + 1, pte_pgprot((pte)));
+ pte = pte_next_pfn(pte);
}
}
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 5c02fd08d61e..12498017da8e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -153,7 +153,6 @@ void mark_rodata_ro(void)
if (v_block_mapped((unsigned long)_stext + 1)) {
mmu_mark_rodata_ro();
- ptdump_check_wx();
return;
}
@@ -166,9 +165,6 @@ void mark_rodata_ro(void)
PFN_DOWN((unsigned long)_stext);
set_memory_ro((unsigned long)_stext, numpages);
-
- // mark_initmem_nx() should have already run by now
- ptdump_check_wx();
}
#endif
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 5ac1fd30341b..1b366526f4f2 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -150,9 +150,6 @@ void mark_rodata_ro(void)
radix__mark_rodata_ro();
else
hash__mark_rodata_ro();
-
- // mark_initmem_nx() should have already run by now
- ptdump_check_wx();
}
void mark_initmem_nx(void)
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 2313053fe679..9dc239967b77 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -184,13 +184,14 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
{
pte_t pte = __pte(st->current_flags);
- if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx)
+ if (!st->check_wx)
return;
if (!pte_write(pte) || !pte_exec(pte))
return;
- WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
+ WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX),
+ "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
(void *)st->start_address, (void *)st->start_address);
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
@@ -326,8 +327,7 @@ static void __init build_pgtable_complete_mask(void)
pg_level[i].mask |= pg_level[i].flag[j].mask;
}
-#ifdef CONFIG_DEBUG_WX
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
{
struct pg_state st = {
.seq = NULL,
@@ -343,15 +343,22 @@ void ptdump_check_wx(void)
}
};
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO))
+ return true;
+
ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
- if (st.wx_pages)
+ if (st.wx_pages) {
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
st.wx_pages);
- else
+
+ return false;
+ } else {
pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+
+ return true;
+ }
}
-#endif
static int __init ptdump_init(void)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 0c94260b5d0c..add5cd30ab34 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -523,6 +523,8 @@ static inline void __set_pte_at(pte_t *ptep, pte_t pteval)
set_pte(ptep, pteval);
}
+#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT
+
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval, unsigned int nr)
{
diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h
deleted file mode 100644
index 3c9ea6dd5af7..000000000000
--- a/arch/riscv/include/asm/ptdump.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2019 SiFive
- */
-
-#ifndef _ASM_RISCV_PTDUMP_H
-#define _ASM_RISCV_PTDUMP_H
-
-void ptdump_check_wx(void);
-
-#ifdef CONFIG_DEBUG_WX
-static inline void debug_checkwx(void)
-{
- ptdump_check_wx();
-}
-#else
-static inline void debug_checkwx(void)
-{
-}
-#endif
-
-#endif /* _ASM_RISCV_PTDUMP_H */
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index fa34cf55037b..a4f218cfb845 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -29,7 +29,6 @@
#include <asm/io.h>
#include <asm/numa.h>
#include <asm/pgtable.h>
-#include <asm/ptdump.h>
#include <asm/sections.h>
#include <asm/soc.h>
#include <asm/tlbflush.h>
@@ -723,8 +722,6 @@ void mark_rodata_ro(void)
if (IS_ENABLED(CONFIG_64BIT))
set_kernel_memory(lm_alias(__start_rodata), lm_alias(_data),
set_memory_ro);
-
- debug_checkwx();
}
#else
static __init pgprot_t pgprot_from_va(uintptr_t va)
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 657c27bc07a7..1289cc6d3700 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -9,7 +9,6 @@
#include <linux/seq_file.h>
#include <linux/ptdump.h>
-#include <asm/ptdump.h>
#include <linux/pgtable.h>
#include <asm/kasan.h>
@@ -336,7 +335,7 @@ static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo)
ptdump_walk_pgd(&st.ptdump, pinfo->mm, NULL);
}
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
{
struct pg_state st = {
.seq = NULL,
@@ -357,11 +356,16 @@ void ptdump_check_wx(void)
ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
- if (st.wx_pages)
+ if (st.wx_pages) {
pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n",
st.wx_pages);
- else
+
+ return false;
+ } else {
pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+
+ return true;
+ }
}
static int ptdump_show(struct seq_file *m, void *v)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index fe565f3a3a91..a1d6dcbc8965 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -113,6 +113,7 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1299b56e43f6..4b91e65c85d9 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1316,6 +1316,8 @@ pgprot_t pgprot_writecombine(pgprot_t prot);
#define pgprot_writethrough pgprot_writethrough
pgprot_t pgprot_writethrough(pgprot_t prot);
+#define PFN_PTE_SHIFT PAGE_SHIFT
+
/*
* Set multiple PTEs to consecutive pages with a single call. All PTEs
* are within the same folio, PMD and VMA.
diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h
deleted file mode 100644
index f960b2896606..000000000000
--- a/arch/s390/include/asm/ptdump.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _ASM_S390_PTDUMP_H
-#define _ASM_S390_PTDUMP_H
-
-void ptdump_check_wx(void);
-
-static inline void debug_checkwx(void)
-{
- if (IS_ENABLED(CONFIG_DEBUG_WX))
- ptdump_check_wx();
-}
-
-#endif /* _ASM_S390_PTDUMP_H */
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index d1455a601adc..e95b2c8081eb 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -25,8 +25,9 @@
void __tlb_remove_table(void *_table);
static inline void tlb_flush(struct mmu_gather *tlb);
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
- struct encoded_page *page,
- int page_size);
+ struct page *page, bool delay_rmap, int page_size);
+static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
+ struct page *page, unsigned int nr_pages, bool delay_rmap);
#define tlb_flush tlb_flush
#define pte_free_tlb pte_free_tlb
@@ -42,14 +43,29 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
* tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
* has already been freed, so just do free_page_and_swap_cache.
*
- * s390 doesn't delay rmap removal, so there is nothing encoded in
- * the page pointer.
+ * s390 doesn't delay rmap removal.
*/
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
- struct encoded_page *page,
- int page_size)
+ struct page *page, bool delay_rmap, int page_size)
{
- free_page_and_swap_cache(encoded_page_ptr(page));
+ VM_WARN_ON_ONCE(delay_rmap);
+
+ free_page_and_swap_cache(page);
+ return false;
+}
+
+static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
+ struct page *page, unsigned int nr_pages, bool delay_rmap)
+{
+ struct encoded_page *encoded_pages[] = {
+ encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT),
+ encode_nr_pages(nr_pages),
+ };
+
+ VM_WARN_ON_ONCE(delay_rmap);
+ VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
+
+ free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
return false;
}
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index d37a8f607b71..ffd07ed7b4af 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -6,7 +6,6 @@
#include <linux/mm.h>
#include <linux/kfence.h>
#include <linux/kasan.h>
-#include <asm/ptdump.h>
#include <asm/kasan.h>
#include <asm/abs_lowcore.h>
#include <asm/nospec-branch.h>
@@ -122,7 +121,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level)
static void note_prot_wx(struct pg_state *st, unsigned long addr)
{
-#ifdef CONFIG_DEBUG_WX
if (!st->check_wx)
return;
if (st->current_prot & _PAGE_INVALID)
@@ -139,10 +137,10 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
*/
if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)))
return;
- WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n",
+ WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX),
+ "s390/mm: Found insecure W+X mapping at address %pS\n",
(void *)st->start_address);
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
-#endif /* CONFIG_DEBUG_WX */
}
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
@@ -194,8 +192,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
}
-#ifdef CONFIG_DEBUG_WX
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
{
struct pg_state st = {
.ptdump = {
@@ -218,16 +215,20 @@ void ptdump_check_wx(void)
};
if (!MACHINE_HAS_NX)
- return;
+ return true;
ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
- if (st.wx_pages)
+ if (st.wx_pages) {
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages);
- else
+
+ return false;
+ } else {
pr_info("Checked W+X mappings: passed, no %sW+X pages found\n",
(nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
"unexpected " : "");
+
+ return true;
+ }
}
-#endif /* CONFIG_DEBUG_WX */
#ifdef CONFIG_PTDUMP_DEBUGFS
static int ptdump_show(struct seq_file *m, void *v)
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 43e612bc2bcd..f6391442c0c2 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -37,7 +37,6 @@
#include <asm/pgalloc.h>
#include <asm/ctlreg.h>
#include <asm/kfence.h>
-#include <asm/ptdump.h>
#include <asm/dma.h>
#include <asm/abs_lowcore.h>
#include <asm/tlb.h>
@@ -109,7 +108,6 @@ void mark_rodata_ro(void)
__set_memory_ro(__start_ro_after_init, __end_ro_after_init);
pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
- debug_checkwx();
}
int set_memory_encrypted(unsigned long vaddr, int numpages)
@@ -281,9 +279,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
unsigned long size_pages = PFN_DOWN(size);
int rc;
- if (WARN_ON_ONCE(params->altmap))
- return -EINVAL;
-
if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
return -EINVAL;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 99422926efe1..b71432b15d66 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -721,9 +721,9 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
if (!non_swap_entry(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
- dec_mm_counter(mm, mm_counter(page));
+ dec_mm_counter(mm, mm_counter(folio));
}
free_swap_and_cache(entry);
}
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 186a020857cf..eb100479f7be 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -33,8 +33,12 @@ static void __ref *vmem_alloc_pages(unsigned int order)
return memblock_alloc(size, size);
}
-static void vmem_free_pages(unsigned long addr, int order)
+static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap)
{
+ if (altmap) {
+ vmem_altmap_free(altmap, 1 << order);
+ return;
+ }
/* We don't expect boot memory to be removed ever. */
if (!slab_is_available() ||
WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
@@ -156,7 +160,8 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
- unsigned long end, bool add, bool direct)
+ unsigned long end, bool add, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long prot, pages = 0;
int ret = -ENOMEM;
@@ -172,11 +177,11 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
if (pte_none(*pte))
continue;
if (!direct)
- vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
+ vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap);
pte_clear(&init_mm, addr, pte);
} else if (pte_none(*pte)) {
if (!direct) {
- void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+ void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap);
if (!new_page)
goto out;
@@ -213,7 +218,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start)
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
- unsigned long end, bool add, bool direct)
+ unsigned long end, bool add, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next, prot, pages = 0;
int ret = -ENOMEM;
@@ -234,11 +240,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
if (IS_ALIGNED(addr, PMD_SIZE) &&
IS_ALIGNED(next, PMD_SIZE)) {
if (!direct)
- vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
pmd_clear(pmd);
pages++;
} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
- vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
pmd_clear(pmd);
}
continue;
@@ -261,7 +267,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
* page tables since vmemmap_populate gets
* called for each section separately.
*/
- new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
+ new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap);
if (new_page) {
set_pmd(pmd, __pmd(__pa(new_page) | prot));
if (!IS_ALIGNED(addr, PMD_SIZE) ||
@@ -280,7 +286,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
vmemmap_use_sub_pmd(addr, next);
continue;
}
- ret = modify_pte_table(pmd, addr, next, add, direct);
+ ret = modify_pte_table(pmd, addr, next, add, direct, altmap);
if (ret)
goto out;
if (!add)
@@ -302,12 +308,12 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start)
for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
if (!pmd_none(*pmd))
return;
- vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
+ vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL);
pud_clear(pud);
}
static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
- bool add, bool direct)
+ bool add, bool direct, struct vmem_altmap *altmap)
{
unsigned long next, prot, pages = 0;
int ret = -ENOMEM;
@@ -347,7 +353,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
} else if (pud_large(*pud)) {
continue;
}
- ret = modify_pmd_table(pud, addr, next, add, direct);
+ ret = modify_pmd_table(pud, addr, next, add, direct, altmap);
if (ret)
goto out;
if (!add)
@@ -370,12 +376,12 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start)
if (!pud_none(*pud))
return;
}
- vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
+ vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL);
p4d_clear(p4d);
}
static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
- bool add, bool direct)
+ bool add, bool direct, struct vmem_altmap *altmap)
{
unsigned long next;
int ret = -ENOMEM;
@@ -394,7 +400,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
goto out;
p4d_populate(&init_mm, p4d, pud);
}
- ret = modify_pud_table(p4d, addr, next, add, direct);
+ ret = modify_pud_table(p4d, addr, next, add, direct, altmap);
if (ret)
goto out;
if (!add)
@@ -415,12 +421,12 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
if (!p4d_none(*p4d))
return;
}
- vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
+ vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL);
pgd_clear(pgd);
}
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
- bool direct)
+ bool direct, struct vmem_altmap *altmap)
{
unsigned long addr, next;
int ret = -ENOMEM;
@@ -445,7 +451,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
goto out;
pgd_populate(&init_mm, pgd, p4d);
}
- ret = modify_p4d_table(pgd, addr, next, add, direct);
+ ret = modify_p4d_table(pgd, addr, next, add, direct, altmap);
if (ret)
goto out;
if (!add)
@@ -458,14 +464,16 @@ out:
return ret;
}
-static int add_pagetable(unsigned long start, unsigned long end, bool direct)
+static int add_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- return modify_pagetable(start, end, true, direct);
+ return modify_pagetable(start, end, true, direct, altmap);
}
-static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
+static int remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- return modify_pagetable(start, end, false, direct);
+ return modify_pagetable(start, end, false, direct, altmap);
}
/*
@@ -474,7 +482,7 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
static int vmem_add_range(unsigned long start, unsigned long size)
{
start = (unsigned long)__va(start);
- return add_pagetable(start, start + size, true);
+ return add_pagetable(start, start + size, true, NULL);
}
/*
@@ -483,7 +491,7 @@ static int vmem_add_range(unsigned long start, unsigned long size)
static void vmem_remove_range(unsigned long start, unsigned long size)
{
start = (unsigned long)__va(start);
- remove_pagetable(start, start + size, true);
+ remove_pagetable(start, start + size, true, NULL);
}
/*
@@ -496,9 +504,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
mutex_lock(&vmem_mutex);
/* We don't care about the node, just use NUMA_NO_NODE on allocations */
- ret = add_pagetable(start, end, false);
+ ret = add_pagetable(start, end, false, altmap);
if (ret)
- remove_pagetable(start, end, false);
+ remove_pagetable(start, end, false, altmap);
mutex_unlock(&vmem_mutex);
return ret;
}
@@ -509,7 +517,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
mutex_lock(&vmem_mutex);
- remove_pagetable(start, end, false);
+ remove_pagetable(start, end, false, altmap);
mutex_unlock(&vmem_mutex);
}
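
s390's vmemmap population now threads the altmap all the way down, so with MHP_MEMMAP_ON_MEMORY the struct pages are carved out of the hot-added block itself rather than taken from (and later returned to) the page allocator. A heavily simplified sketch of what the altmap-aware allocation amounts to (illustrative only; it ignores the reserve/alignment/exhaustion handling the real mm/sparse-vmemmap.c helpers perform):

#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/pfn.h>

/* Sketch: where vmemmap backing memory comes from with and without an altmap. */
static void *vmemmap_block_sketch(unsigned long size, int node,
				  struct vmem_altmap *altmap)
{
	if (altmap) {
		/* Hand out the next chunk of the hot-added range itself. */
		unsigned long pfn = altmap->base_pfn + altmap->reserve +
				    altmap->alloc;

		altmap->alloc += size >> PAGE_SHIFT;
		return __va(PFN_PHYS(pfn));
	}
	/* No altmap: regular allocation, returned to the buddy on removal. */
	return vmemmap_alloc_block(size, node);
}

This is also why vmem_free_pages() above must go through vmem_altmap_free() instead of free_pages() whenever an altmap is present.
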
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7500521b2b98..2ad3e29f0ebe 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -2,6 +2,7 @@
config SUPERH
def_bool y
select ARCH_32BIT_OFF_T
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU
select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU
select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
diff --git a/arch/sh/include/asm/cachetype.h b/arch/sh/include/asm/cachetype.h
new file mode 100644
index 000000000000..a5fffe536068
--- /dev/null
+++ b/arch/sh/include/asm/cachetype.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_SH_CACHETYPE_H
+#define __ASM_SH_CACHETYPE_H
+
+#include <linux/types.h>
+
+#define cpu_dcache_is_aliasing() true
+
+#endif
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 204c43cb3d43..cbec48219d9e 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -13,6 +13,7 @@ config 64BIT
config SPARC
bool
default y
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI
select ARCH_MIGHT_HAVE_PC_SERIO
select DMA_OPS
diff --git a/arch/sparc/include/asm/cachetype.h b/arch/sparc/include/asm/cachetype.h
new file mode 100644
index 000000000000..caf1c0045892
--- /dev/null
+++ b/arch/sparc/include/asm/cachetype.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_SPARC_CACHETYPE_H
+#define __ASM_SPARC_CACHETYPE_H
+
+#include <asm/page.h>
+
+#ifdef CONFIG_SPARC32
+extern int vac_cache_size;
+#define cpu_dcache_is_aliasing() (vac_cache_size > PAGE_SIZE)
+#else
+#define cpu_dcache_is_aliasing() (L1DCACHE_SIZE > PAGE_SIZE)
+#endif
+
+#endif
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index a8c871b7d786..652af9d63fa2 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -929,6 +929,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm, PAGE_SHIFT);
}
+#define PFN_PTE_SHIFT PAGE_SHIFT
+
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr)
{
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 0da5c227f490..ce4677b8b735 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -75,7 +75,7 @@ typedef struct {
.lock = __MUTEX_INITIALIZER(mm.context.lock), \
}
-void leave_mm(int cpu);
+void leave_mm(void);
#define leave_mm leave_mm
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 9d077bca6a10..69ed0ea0641b 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -31,7 +31,8 @@ struct seq_file;
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
bool user);
-void ptdump_walk_pgd_level_checkwx(void);
+bool ptdump_walk_pgd_level_checkwx(void);
+#define ptdump_check_wx ptdump_walk_pgd_level_checkwx
void ptdump_walk_user_pgd_level_checkwx(void);
/*
@@ -41,10 +42,8 @@ void ptdump_walk_user_pgd_level_checkwx(void);
#define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot)))
#ifdef CONFIG_DEBUG_WX
-#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
#else
-#define debug_checkwx() do { } while (0)
#define debug_checkwx_user() do { } while (0)
#endif
@@ -956,13 +955,13 @@ static inline int pte_same(pte_t a, pte_t b)
return a.pte == b.pte;
}
-static inline pte_t pte_next_pfn(pte_t pte)
+static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
if (__pte_needs_invert(pte_val(pte)))
- return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT));
- return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+ return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT));
+ return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
}
-#define pte_next_pfn pte_next_pfn
+#define pte_advance_pfn pte_advance_pfn
static inline int pte_present(pte_t a)
{
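
The x86 hunk above replaces its pte_next_pfn() override with pte_advance_pfn(); generically, pte_next_pfn() then becomes just the nr == 1 case. A rough sketch of the generic fallback (illustrative; the actual include/linux/pgtable.h wording may differ):

/* Sketch: generic fallback used when an arch does not override it. */
#ifndef pte_advance_pfn
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
	return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
}
#endif

/* Stepping by a single PFN preserves the old pte_next_pfn() behaviour. */
#define pte_next_pfn(pte)	pte_advance_pfn(pte, 1)
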
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1d85cb7071cb..21108d8e6f6b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1805,7 +1805,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
* restoring the previous mm.
*/
if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
- leave_mm(smp_processor_id());
+ leave_mm();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e1b599ecbbc2..35b2cfd47914 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -362,9 +362,9 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
}
-static void ptdump_walk_pgd_level_core(struct seq_file *m,
- struct mm_struct *mm, pgd_t *pgd,
- bool checkwx, bool dmesg)
+bool ptdump_walk_pgd_level_core(struct seq_file *m,
+ struct mm_struct *mm, pgd_t *pgd,
+ bool checkwx, bool dmesg)
{
const struct ptdump_range ptdump_ranges[] = {
#ifdef CONFIG_X86_64
@@ -391,12 +391,17 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m,
ptdump_walk_pgd(&st.ptdump, mm, pgd);
if (!checkwx)
- return;
- if (st.wx_pages)
+ return true;
+ if (st.wx_pages) {
pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
st.wx_pages);
- else
+
+ return false;
+ } else {
pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
+
+ return true;
+ }
}
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
@@ -431,9 +436,12 @@ void ptdump_walk_user_pgd_level_checkwx(void)
#endif
}
-void ptdump_walk_pgd_level_checkwx(void)
+bool ptdump_walk_pgd_level_checkwx(void)
{
- ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return true;
+
+ return ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
}
static int __init pt_dump_init(void)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b63403d7179d..5c736b707cae 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -800,6 +800,4 @@ void mark_rodata_ro(void)
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
- if (__supported_pte_mask & _PAGE_NX)
- debug_checkwx();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a0dffaca6d2b..ebdbcae48011 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1412,8 +1412,6 @@ void mark_rodata_ro(void)
(void *)text_end, (void *)rodata_start);
free_kernel_image_pages("unused kernel image (rodata/data gap)",
(void *)rodata_end, (void *)_sdata);
-
- debug_checkwx();
}
/*
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5768d386efab..bf9605caf24f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -299,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
write_cr3(new_mm_cr3);
}
-void leave_mm(int cpu)
+void leave_mm(void)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -492,10 +492,16 @@ void cr4_update_pce(void *ignored)
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
-void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+/*
+ * The "prev" argument passed by the caller does not always match CR3. For
+ * example, the scheduler passes in active_mm when switching from lazy TLB mode
+ * to normal mode, but switch_mm_irqs_off() can be called from x86 code without
+ * updating active_mm. Use cpu_tlbstate.loaded_mm instead.
+ */
+void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
struct task_struct *tsk)
{
- struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
@@ -504,15 +510,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
bool need_flush;
u16 new_asid;
- /*
- * NB: The scheduler will call us with prev == next when switching
- * from lazy TLB mode to normal mode if active_mm isn't changing.
- * When this happens, we don't assume that CR3 (and hence
- * cpu_tlbstate.loaded_mm) matches next.
- *
- * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
- */
-
/* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
@@ -527,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
tlbstate_lam_cr3_mask()))) {
/*
* If we were to BUG here, we'd be very likely to kill
@@ -559,7 +556,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* provides that full memory barrier and core serializing
* instruction.
*/
- if (real_prev == next) {
+ if (prev == next) {
/* Not actually switching mm's */
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
@@ -574,7 +571,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
- if (WARN_ON_ONCE(real_prev != &init_mm &&
+ if (WARN_ON_ONCE(prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -616,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
* but the bitmap manipulation can cause cache line contention.
*/
- if (real_prev != &init_mm) {
+ if (prev != &init_mm) {
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
- mm_cpumask(real_prev)));
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ mm_cpumask(prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
}
/*
@@ -656,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
- if (next != real_prev) {
+ if (next != prev) {
cr4_update_pce_mm(next);
- switch_ldt(real_prev, next);
+ switch_ldt(prev, next);
}
}
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 72af496a160c..218773cfb009 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -913,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info)
struct mm_struct *mm = info;
if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
- leave_mm(smp_processor_id());
+ leave_mm();
/*
* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 6f248d87e496..6689a8547346 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -2,6 +2,7 @@
config XTENSA
def_bool y
select ARCH_32BIT_OFF_T
+ select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_BINFMT_FLAT if !MMU
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/xtensa/include/asm/cachetype.h b/arch/xtensa/include/asm/cachetype.h
new file mode 100644
index 000000000000..51bd49e2a1c5
--- /dev/null
+++ b/arch/xtensa/include/asm/cachetype.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_XTENSA_CACHETYPE_H
+#define __ASM_XTENSA_CACHETYPE_H
+
+#include <asm/cache.h>
+#include <asm/page.h>
+
+#define cpu_dcache_is_aliasing() (DCACHE_WAY_SIZE > PAGE_SIZE)
+
+#endif
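
The new asm/cachetype.h headers added in this series back the cpu_dcache_is_aliasing() query; architectures that do not select ARCH_HAS_CPU_CACHE_ALIASING simply report "no aliasing". The generic fallback is roughly the following sketch (illustrative, not the exact cacheinfo.h hunk):

#ifdef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING
#include <asm/cachetype.h>		/* arch-specific answer, as added above */
#else
#define cpu_dcache_is_aliasing()	false	/* no virtually-aliased d-cache */
#endif

alloc_dax() uses this query further down to refuse FS-DAX on aliasing data caches, which is what lets fs/Kconfig drop its hard-coded !(ARM || MIPS || SPARC) dependency.
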
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index f1e79263fe61..23b8cba4a2a3 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -898,6 +898,37 @@ err:
return rc;
}
+static unsigned int cpu_map_shared_cache(bool online, unsigned int cpu,
+ cpumask_t **map)
+{
+ struct cacheinfo *llc, *sib_llc;
+ unsigned int sibling;
+
+ if (!last_level_cache_is_valid(cpu))
+ return 0;
+
+ llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+
+ if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED)
+ return 0;
+
+ if (online) {
+ *map = &llc->shared_cpu_map;
+ return cpumask_weight(*map);
+ }
+
+ /* shared_cpu_map of offlined CPU will be cleared, so use sibling map */
+ for_each_cpu(sibling, &llc->shared_cpu_map) {
+ if (sibling == cpu || !last_level_cache_is_valid(sibling))
+ continue;
+ sib_llc = per_cpu_cacheinfo_idx(sibling, cache_leaves(sibling) - 1);
+ *map = &sib_llc->shared_cpu_map;
+ return cpumask_weight(*map);
+ }
+
+ return 0;
+}
+
/*
* Calculate the size of the per-CPU data cache slice. This can be
* used to estimate the size of the data cache slice that can be used
@@ -929,28 +960,31 @@ static void update_per_cpu_data_slice_size_cpu(unsigned int cpu)
ci->per_cpu_data_slice_size = llc->size / nr_shared;
}
-static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu)
+static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu,
+ cpumask_t *cpu_map)
{
unsigned int icpu;
- for_each_online_cpu(icpu) {
+ for_each_cpu(icpu, cpu_map) {
if (!cpu_online && icpu == cpu)
continue;
update_per_cpu_data_slice_size_cpu(icpu);
+ setup_pcp_cacheinfo(icpu);
}
}
static int cacheinfo_cpu_online(unsigned int cpu)
{
int rc = detect_cache_attributes(cpu);
+ cpumask_t *cpu_map;
if (rc)
return rc;
rc = cache_add_dev(cpu);
if (rc)
goto err;
- update_per_cpu_data_slice_size(true, cpu);
- setup_pcp_cacheinfo();
+ if (cpu_map_shared_cache(true, cpu, &cpu_map))
+ update_per_cpu_data_slice_size(true, cpu, cpu_map);
return 0;
err:
free_cache_attributes(cpu);
@@ -959,12 +993,16 @@ err:
static int cacheinfo_cpu_pre_down(unsigned int cpu)
{
+ cpumask_t *cpu_map;
+ unsigned int nr_shared;
+
+ nr_shared = cpu_map_shared_cache(false, cpu, &cpu_map);
if (cpumask_test_and_clear_cpu(cpu, &cache_dev_map))
cpu_cache_sysfs_exit(cpu);
free_cache_attributes(cpu);
- update_per_cpu_data_slice_size(false, cpu);
- setup_pcp_cacheinfo();
+ if (nr_shared > 1)
+ update_per_cpu_data_slice_size(false, cpu, cpu_map);
return 0;
}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 14f964a7719b..c0436f46cfb7 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -188,6 +188,7 @@ static int memory_block_online(struct memory_block *mem)
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
unsigned long nr_vmemmap_pages = 0;
+ struct memory_notify arg;
struct zone *zone;
int ret;
@@ -207,9 +208,19 @@ static int memory_block_online(struct memory_block *mem)
if (mem->altmap)
nr_vmemmap_pages = mem->altmap->free;
+ arg.altmap_start_pfn = start_pfn;
+ arg.altmap_nr_pages = nr_vmemmap_pages;
+ arg.start_pfn = start_pfn + nr_vmemmap_pages;
+ arg.nr_pages = nr_pages - nr_vmemmap_pages;
mem_hotplug_begin();
+ ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto out_notifier;
+
if (nr_vmemmap_pages) {
- ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+ ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
+ zone, mem->altmap->inaccessible);
if (ret)
goto out;
}
@@ -231,7 +242,11 @@ static int memory_block_online(struct memory_block *mem)
nr_vmemmap_pages);
mem->zone = zone;
+ mem_hotplug_done();
+ return ret;
out:
+ memory_notify(MEM_FINISH_OFFLINE, &arg);
+out_notifier:
mem_hotplug_done();
return ret;
}
@@ -244,6 +259,7 @@ static int memory_block_offline(struct memory_block *mem)
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
unsigned long nr_vmemmap_pages = 0;
+ struct memory_notify arg;
int ret;
if (!mem->zone)
@@ -275,6 +291,11 @@ static int memory_block_offline(struct memory_block *mem)
mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
mem->zone = NULL;
+ arg.altmap_start_pfn = start_pfn;
+ arg.altmap_nr_pages = nr_vmemmap_pages;
+ arg.start_pfn = start_pfn + nr_vmemmap_pages;
+ arg.nr_pages = nr_pages - nr_vmemmap_pages;
+ memory_notify(MEM_FINISH_OFFLINE, &arg);
out:
mem_hotplug_done();
return ret;
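
The two new notifier events carry the altmap (self-hosted memmap) portion of the block in separate fields, so a consumer that must act on the whole physical range folds them back in. A hypothetical consumer sketch (names are illustrative; the s390 sclp handler later in this patch is the real user):

#include <linux/memory.h>
#include <linux/notifier.h>

static int example_mem_notifier(struct notifier_block *nb,
				unsigned long action, void *data)
{
	struct memory_notify *arg = data;
	unsigned long start_pfn = arg->start_pfn;
	unsigned long nr_pages = arg->nr_pages;

	/* Fold the vmemmap (altmap) pages back in to cover the full block. */
	if (arg->altmap_nr_pages) {
		start_pfn = arg->altmap_start_pfn;
		nr_pages += arg->altmap_nr_pages;
	}

	switch (action) {
	case MEM_PREPARE_ONLINE:
		/* make [start_pfn, start_pfn + nr_pages) accessible */
		break;
	case MEM_FINISH_OFFLINE:
		/* undo whatever MEM_PREPARE_ONLINE set up */
		break;
	}
	return NOTIFY_OK;
}
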
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 55af4efd7983..8237b08c49d8 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/crypto.h>
+#include <linux/vmalloc.h>
#include "zcomp.h"
@@ -37,7 +38,7 @@ static void zcomp_strm_free(struct zcomp_strm *zstrm)
{
if (!IS_ERR_OR_NULL(zstrm->tfm))
crypto_free_comp(zstrm->tfm);
- free_pages((unsigned long)zstrm->buffer, 1);
+ vfree(zstrm->buffer);
zstrm->tfm = NULL;
zstrm->buffer = NULL;
}
@@ -53,7 +54,7 @@ static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp)
* allocate 2 pages. 1 for compressed data, plus 1 extra for the
* case when compressed size is larger than the original one
*/
- zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+ zstrm->buffer = vzalloc(2 * PAGE_SIZE);
if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) {
zcomp_strm_free(zstrm);
return -ENOMEM;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 6772e0c654fa..242a1fece18d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1337,7 +1337,7 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page,
src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
if (size == PAGE_SIZE) {
dst = kmap_local_page(page);
- memcpy(dst, src, PAGE_SIZE);
+ copy_page(dst, src);
kunmap_local(dst);
ret = 0;
} else {
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 737a026ef58a..02e40fd7d948 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -237,7 +237,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
}
if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
- leave_mm(dev->cpu);
+ leave_mm();
/* Take note of the planned idle state. */
sched_idle_set_state(target_state);
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 1ff1ab5fa105..27c86d0ca711 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -12,6 +12,18 @@
static DEFINE_MUTEX(dax_bus_lock);
+/*
+ * All changes to the dax region configuration occur with this lock held
+ * for write.
+ */
+DECLARE_RWSEM(dax_region_rwsem);
+
+/*
+ * All changes to the dax device configuration occur with this lock held
+ * for write.
+ */
+DECLARE_RWSEM(dax_dev_rwsem);
+
#define DAX_NAME_LEN 30
struct dax_id {
struct list_head list;
@@ -180,7 +192,7 @@ static u64 dev_dax_size(struct dev_dax *dev_dax)
u64 size = 0;
int i;
- device_lock_assert(&dev_dax->dev);
+ WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem));
for (i = 0; i < dev_dax->nr_range; i++)
size += range_len(&dev_dax->ranges[i].range);
@@ -194,8 +206,15 @@ static int dax_bus_probe(struct device *dev)
struct dev_dax *dev_dax = to_dev_dax(dev);
struct dax_region *dax_region = dev_dax->region;
int rc;
+ u64 size;
+
+ rc = down_read_interruptible(&dax_dev_rwsem);
+ if (rc)
+ return rc;
+ size = dev_dax_size(dev_dax);
+ up_read(&dax_dev_rwsem);
- if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0)
+ if (size == 0 || dev_dax->id < 0)
return -ENXIO;
rc = dax_drv->probe(dev_dax);
@@ -250,7 +269,7 @@ static ssize_t id_show(struct device *dev,
{
struct dax_region *dax_region = dev_get_drvdata(dev);
- return sprintf(buf, "%d\n", dax_region->id);
+ return sysfs_emit(buf, "%d\n", dax_region->id);
}
static DEVICE_ATTR_RO(id);
@@ -259,8 +278,8 @@ static ssize_t region_size_show(struct device *dev,
{
struct dax_region *dax_region = dev_get_drvdata(dev);
- return sprintf(buf, "%llu\n", (unsigned long long)
- resource_size(&dax_region->res));
+ return sysfs_emit(buf, "%llu\n",
+ (unsigned long long)resource_size(&dax_region->res));
}
static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
region_size_show, NULL);
@@ -270,7 +289,7 @@ static ssize_t region_align_show(struct device *dev,
{
struct dax_region *dax_region = dev_get_drvdata(dev);
- return sprintf(buf, "%u\n", dax_region->align);
+ return sysfs_emit(buf, "%u\n", dax_region->align);
}
static struct device_attribute dev_attr_region_align =
__ATTR(align, 0400, region_align_show, NULL);
@@ -283,7 +302,7 @@ static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
resource_size_t size = resource_size(&dax_region->res);
struct resource *res;
- device_lock_assert(dax_region->dev);
+ WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
for_each_dax_region_resource(dax_region, res)
size -= resource_size(res);
@@ -295,12 +314,15 @@ static ssize_t available_size_show(struct device *dev,
{
struct dax_region *dax_region = dev_get_drvdata(dev);
unsigned long long size;
+ int rc;
- device_lock(dev);
+ rc = down_read_interruptible(&dax_region_rwsem);
+ if (rc)
+ return rc;
size = dax_region_avail_size(dax_region);
- device_unlock(dev);
+ up_read(&dax_region_rwsem);
- return sprintf(buf, "%llu\n", size);
+ return sysfs_emit(buf, "%llu\n", size);
}
static DEVICE_ATTR_RO(available_size);
@@ -314,10 +336,12 @@ static ssize_t seed_show(struct device *dev,
if (is_static(dax_region))
return -EINVAL;
- device_lock(dev);
+ rc = down_read_interruptible(&dax_region_rwsem);
+ if (rc)
+ return rc;
seed = dax_region->seed;
- rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : "");
- device_unlock(dev);
+ rc = sysfs_emit(buf, "%s\n", seed ? dev_name(seed) : "");
+ up_read(&dax_region_rwsem);
return rc;
}
@@ -333,14 +357,18 @@ static ssize_t create_show(struct device *dev,
if (is_static(dax_region))
return -EINVAL;
- device_lock(dev);
+ rc = down_read_interruptible(&dax_region_rwsem);
+ if (rc)
+ return rc;
youngest = dax_region->youngest;
- rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : "");
- device_unlock(dev);
+ rc = sysfs_emit(buf, "%s\n", youngest ? dev_name(youngest) : "");
+ up_read(&dax_region_rwsem);
return rc;
}
+static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data);
+
static ssize_t create_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -358,7 +386,9 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr,
if (val != 1)
return -EINVAL;
- device_lock(dev);
+ rc = down_write_killable(&dax_region_rwsem);
+ if (rc)
+ return rc;
avail = dax_region_avail_size(dax_region);
if (avail == 0)
rc = -ENOSPC;
@@ -369,7 +399,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr,
.id = -1,
.memmap_on_memory = false,
};
- struct dev_dax *dev_dax = devm_create_dev_dax(&data);
+ struct dev_dax *dev_dax = __devm_create_dev_dax(&data);
if (IS_ERR(dev_dax))
rc = PTR_ERR(dev_dax);
@@ -387,7 +417,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr,
rc = len;
}
}
- device_unlock(dev);
+ up_write(&dax_region_rwsem);
return rc;
}
@@ -417,7 +447,7 @@ static void trim_dev_dax_range(struct dev_dax *dev_dax)
struct range *range = &dev_dax->ranges[i].range;
struct dax_region *dax_region = dev_dax->region;
- device_lock_assert(dax_region->dev);
+ WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i,
(unsigned long long)range->start,
(unsigned long long)range->end);
@@ -435,7 +465,7 @@ static void free_dev_dax_ranges(struct dev_dax *dev_dax)
trim_dev_dax_range(dev_dax);
}
-static void unregister_dev_dax(void *dev)
+static void __unregister_dev_dax(void *dev)
{
struct dev_dax *dev_dax = to_dev_dax(dev);
@@ -447,6 +477,17 @@ static void unregister_dev_dax(void *dev)
put_device(dev);
}
+static void unregister_dev_dax(void *dev)
+{
+ if (rwsem_is_locked(&dax_region_rwsem))
+ return __unregister_dev_dax(dev);
+
+ if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0))
+ return;
+ __unregister_dev_dax(dev);
+ up_write(&dax_region_rwsem);
+}
+
static void dax_region_free(struct kref *kref)
{
struct dax_region *dax_region;
@@ -463,11 +504,10 @@ static void dax_region_put(struct dax_region *dax_region)
/* a return value >= 0 indicates this invocation invalidated the id */
static int __free_dev_dax_id(struct dev_dax *dev_dax)
{
- struct device *dev = &dev_dax->dev;
struct dax_region *dax_region;
int rc = dev_dax->id;
- device_lock_assert(dev);
+ WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem));
if (!dev_dax->dyn_id || dev_dax->id < 0)
return -1;
@@ -480,12 +520,13 @@ static int __free_dev_dax_id(struct dev_dax *dev_dax)
static int free_dev_dax_id(struct dev_dax *dev_dax)
{
- struct device *dev = &dev_dax->dev;
int rc;
- device_lock(dev);
+ rc = down_write_killable(&dax_dev_rwsem);
+ if (rc)
+ return rc;
rc = __free_dev_dax_id(dev_dax);
- device_unlock(dev);
+ up_write(&dax_dev_rwsem);
return rc;
}
@@ -519,8 +560,14 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr,
if (!victim)
return -ENXIO;
- device_lock(dev);
- device_lock(victim);
+ rc = down_write_killable(&dax_region_rwsem);
+ if (rc)
+ return rc;
+ rc = down_write_killable(&dax_dev_rwsem);
+ if (rc) {
+ up_write(&dax_region_rwsem);
+ return rc;
+ }
dev_dax = to_dev_dax(victim);
if (victim->driver || dev_dax_size(dev_dax))
rc = -EBUSY;
@@ -541,12 +588,12 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr,
} else
rc = -EBUSY;
}
- device_unlock(victim);
+ up_write(&dax_dev_rwsem);
/* won the race to invalidate the device, clean it up */
if (do_del)
devm_release_action(dev, unregister_dev_dax, victim);
- device_unlock(dev);
+ up_write(&dax_region_rwsem);
put_device(victim);
return rc;
@@ -658,16 +705,15 @@ static void dax_mapping_release(struct device *dev)
put_device(parent);
}
-static void unregister_dax_mapping(void *data)
+static void __unregister_dax_mapping(void *data)
{
struct device *dev = data;
struct dax_mapping *mapping = to_dax_mapping(dev);
struct dev_dax *dev_dax = to_dev_dax(dev->parent);
- struct dax_region *dax_region = dev_dax->region;
dev_dbg(dev, "%s\n", __func__);
- device_lock_assert(dax_region->dev);
+ WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
dev_dax->ranges[mapping->range_id].mapping = NULL;
mapping->range_id = -1;
@@ -675,28 +721,37 @@ static void unregister_dax_mapping(void *data)
device_unregister(dev);
}
+static void unregister_dax_mapping(void *data)
+{
+ if (rwsem_is_locked(&dax_region_rwsem))
+ return __unregister_dax_mapping(data);
+
+ if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0))
+ return;
+ __unregister_dax_mapping(data);
+ up_write(&dax_region_rwsem);
+}
+
static struct dev_dax_range *get_dax_range(struct device *dev)
{
struct dax_mapping *mapping = to_dax_mapping(dev);
struct dev_dax *dev_dax = to_dev_dax(dev->parent);
- struct dax_region *dax_region = dev_dax->region;
+ int rc;
- device_lock(dax_region->dev);
+ rc = down_write_killable(&dax_region_rwsem);
+ if (rc)
+ return NULL;
if (mapping->range_id < 0) {
- device_unlock(dax_region->dev);
+ up_write(&dax_region_rwsem);
return NULL;
}
return &dev_dax->ranges[mapping->range_id];
}
-static void put_dax_range(struct dev_dax_range *dax_range)
+static void put_dax_range(void)
{
- struct dax_mapping *mapping = dax_range->mapping;
- struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent);
- struct dax_region *dax_region = dev_dax->region;
-
- device_unlock(dax_region->dev);
+ up_write(&dax_region_rwsem);
}
static ssize_t start_show(struct device *dev,
@@ -708,8 +763,8 @@ static ssize_t start_show(struct device *dev,
dax_range = get_dax_range(dev);
if (!dax_range)
return -ENXIO;
- rc = sprintf(buf, "%#llx\n", dax_range->range.start);
- put_dax_range(dax_range);
+ rc = sysfs_emit(buf, "%#llx\n", dax_range->range.start);
+ put_dax_range();
return rc;
}
@@ -724,8 +779,8 @@ static ssize_t end_show(struct device *dev,
dax_range = get_dax_range(dev);
if (!dax_range)
return -ENXIO;
- rc = sprintf(buf, "%#llx\n", dax_range->range.end);
- put_dax_range(dax_range);
+ rc = sysfs_emit(buf, "%#llx\n", dax_range->range.end);
+ put_dax_range();
return rc;
}
@@ -740,8 +795,8 @@ static ssize_t pgoff_show(struct device *dev,
dax_range = get_dax_range(dev);
if (!dax_range)
return -ENXIO;
- rc = sprintf(buf, "%#lx\n", dax_range->pgoff);
- put_dax_range(dax_range);
+ rc = sysfs_emit(buf, "%#lx\n", dax_range->pgoff);
+ put_dax_range();
return rc;
}
@@ -775,7 +830,7 @@ static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id)
struct device *dev;
int rc;
- device_lock_assert(dax_region->dev);
+ WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver,
"region disabled\n"))
@@ -821,7 +876,7 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
struct resource *alloc;
int i, rc;
- device_lock_assert(dax_region->dev);
+ WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
/* handle the seed alloc special case */
if (!size) {
@@ -875,13 +930,12 @@ static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, r
{
int last_range = dev_dax->nr_range - 1;
struct dev_dax_range *dax_range = &dev_dax->ranges[last_range];
- struct dax_region *dax_region = dev_dax->region;
bool is_shrink = resource_size(res) > size;
struct range *range = &dax_range->range;
struct device *dev = &dev_dax->dev;
int rc;
- device_lock_assert(dax_region->dev);
+ WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n"))
return -EINVAL;
@@ -907,12 +961,15 @@ static ssize_t size_show(struct device *dev,
{
struct dev_dax *dev_dax = to_dev_dax(dev);
unsigned long long size;
+ int rc;
- device_lock(dev);
+ rc = down_write_killable(&dax_dev_rwsem);
+ if (rc)
+ return rc;
size = dev_dax_size(dev_dax);
- device_unlock(dev);
+ up_write(&dax_dev_rwsem);
- return sprintf(buf, "%llu\n", size);
+ return sysfs_emit(buf, "%llu\n", size);
}
static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size)
@@ -1080,17 +1137,27 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr,
return -EINVAL;
}
- device_lock(dax_region->dev);
+ rc = down_write_killable(&dax_region_rwsem);
+ if (rc)
+ return rc;
if (!dax_region->dev->driver) {
- device_unlock(dax_region->dev);
- return -ENXIO;
+ rc = -ENXIO;
+ goto err_region;
}
- device_lock(dev);
+ rc = down_write_killable(&dax_dev_rwsem);
+ if (rc)
+ goto err_dev;
+
rc = dev_dax_resize(dax_region, dev_dax, val);
- device_unlock(dev);
- device_unlock(dax_region->dev);
- return rc == 0 ? len : rc;
+err_dev:
+ up_write(&dax_dev_rwsem);
+err_region:
+ up_write(&dax_region_rwsem);
+
+ if (rc == 0)
+ return len;
+ return rc;
}
static DEVICE_ATTR_RW(size);
@@ -1138,18 +1205,24 @@ static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
return rc;
rc = -ENXIO;
- device_lock(dax_region->dev);
+ rc = down_write_killable(&dax_region_rwsem);
+ if (rc)
+ return rc;
if (!dax_region->dev->driver) {
- device_unlock(dax_region->dev);
+ up_write(&dax_region_rwsem);
+ return rc;
+ }
+ rc = down_write_killable(&dax_dev_rwsem);
+ if (rc) {
+ up_write(&dax_region_rwsem);
return rc;
}
- device_lock(dev);
to_alloc = range_len(&r);
if (alloc_is_aligned(dev_dax, to_alloc))
rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc);
- device_unlock(dev);
- device_unlock(dax_region->dev);
+ up_write(&dax_dev_rwsem);
+ up_write(&dax_region_rwsem);
return rc == 0 ? len : rc;
}
@@ -1160,7 +1233,7 @@ static ssize_t align_show(struct device *dev,
{
struct dev_dax *dev_dax = to_dev_dax(dev);
- return sprintf(buf, "%d\n", dev_dax->align);
+ return sysfs_emit(buf, "%d\n", dev_dax->align);
}
static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax)
@@ -1196,13 +1269,19 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr,
if (!dax_align_valid(val))
return -EINVAL;
- device_lock(dax_region->dev);
+ rc = down_write_killable(&dax_region_rwsem);
+ if (rc)
+ return rc;
if (!dax_region->dev->driver) {
- device_unlock(dax_region->dev);
+ up_write(&dax_region_rwsem);
return -ENXIO;
}
- device_lock(dev);
+ rc = down_write_killable(&dax_dev_rwsem);
+ if (rc) {
+ up_write(&dax_region_rwsem);
+ return rc;
+ }
if (dev->driver) {
rc = -EBUSY;
goto out_unlock;
@@ -1214,8 +1293,8 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr,
if (rc)
dev_dax->align = align_save;
out_unlock:
- device_unlock(dev);
- device_unlock(dax_region->dev);
+ up_write(&dax_dev_rwsem);
+ up_write(&dax_region_rwsem);
return rc == 0 ? len : rc;
}
static DEVICE_ATTR_RW(align);
@@ -1232,7 +1311,7 @@ static ssize_t target_node_show(struct device *dev,
{
struct dev_dax *dev_dax = to_dev_dax(dev);
- return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax));
+ return sysfs_emit(buf, "%d\n", dev_dax_target_node(dev_dax));
}
static DEVICE_ATTR_RO(target_node);
@@ -1248,7 +1327,7 @@ static ssize_t resource_show(struct device *dev,
else
start = dev_dax->ranges[0].range.start;
- return sprintf(buf, "%#llx\n", start);
+ return sysfs_emit(buf, "%#llx\n", start);
}
static DEVICE_ATTR(resource, 0400, resource_show, NULL);
@@ -1259,17 +1338,59 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
* We only ever expect to handle device-dax instances, i.e. the
* @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero
*/
- return sprintf(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0);
+ return sysfs_emit(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0);
}
static DEVICE_ATTR_RO(modalias);
static ssize_t numa_node_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", dev_to_node(dev));
+ return sysfs_emit(buf, "%d\n", dev_to_node(dev));
}
static DEVICE_ATTR_RO(numa_node);
+static ssize_t memmap_on_memory_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+
+ return sysfs_emit(buf, "%d\n", dev_dax->memmap_on_memory);
+}
+
+static ssize_t memmap_on_memory_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+ bool val;
+ int rc;
+
+ rc = kstrtobool(buf, &val);
+ if (rc)
+ return rc;
+
+ if (val == true && !mhp_supports_memmap_on_memory()) {
+ dev_dbg(dev, "memmap_on_memory is not available\n");
+ return -EOPNOTSUPP;
+ }
+
+ rc = down_write_killable(&dax_dev_rwsem);
+ if (rc)
+ return rc;
+
+ if (dev_dax->memmap_on_memory != val && dev->driver &&
+ to_dax_drv(dev->driver)->type == DAXDRV_KMEM_TYPE) {
+ up_write(&dax_dev_rwsem);
+ return -EBUSY;
+ }
+
+ dev_dax->memmap_on_memory = val;
+ up_write(&dax_dev_rwsem);
+
+ return len;
+}
+static DEVICE_ATTR_RW(memmap_on_memory);
+
static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
@@ -1296,6 +1417,7 @@ static struct attribute *dev_dax_attributes[] = {
&dev_attr_align.attr,
&dev_attr_resource.attr,
&dev_attr_numa_node.attr,
+ &dev_attr_memmap_on_memory.attr,
NULL,
};
@@ -1325,7 +1447,7 @@ static const struct device_type dev_dax_type = {
.groups = dax_attribute_groups,
};
-struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
+static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data)
{
struct dax_region *dax_region = data->dax_region;
struct device *parent = dax_region->dev;
@@ -1440,6 +1562,21 @@ err_id:
return ERR_PTR(rc);
}
+
+struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
+{
+ struct dev_dax *dev_dax;
+ int rc;
+
+ rc = down_write_killable(&dax_region_rwsem);
+ if (rc)
+ return ERR_PTR(rc);
+
+ dev_dax = __devm_create_dev_dax(data);
+ up_write(&dax_region_rwsem);
+
+ return dev_dax;
+}
EXPORT_SYMBOL_GPL(devm_create_dev_dax);
int __dax_driver_register(struct dax_device_driver *dax_drv,
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index f4b635526345..54e528779877 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -13,6 +13,7 @@
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/fs.h>
+#include <linux/cacheinfo.h>
#include "dax-private.h"
/**
@@ -319,6 +320,11 @@ EXPORT_SYMBOL_GPL(dax_alive);
* that any fault handlers or operations that might have seen
* dax_alive(), have completed. Any operations that start after
* synchronize_srcu() has run will abort upon seeing !dax_alive().
+ *
+ * Note, because alloc_dax() returns an ERR_PTR() on error, callers
+ * typically store its result into a local variable in order to check
+ * the result. Therefore, care must be taken to populate the struct
+ * device dax_dev field make sure the dax_dev is not leaked.
*/
void kill_dax(struct dax_device *dax_dev)
{
@@ -446,6 +452,14 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops)
dev_t devt;
int minor;
+ /*
+ * Unavailable on architectures with virtually aliased data caches,
+ * except for device-dax (NULL operations pointer), which does
+ * not use aliased mappings from the kernel.
+ */
+ if (ops && cpu_dcache_is_aliasing())
+ return ERR_PTR(-EOPNOTSUPP);
+
if (WARN_ON_ONCE(ops && !ops->zero_page_range))
return ERR_PTR(-EINVAL);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8dcabf84d866..10c73af93d00 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2054,6 +2054,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
static struct mapped_device *alloc_dev(int minor)
{
int r, numa_node_id = dm_get_numa_node();
+ struct dax_device *dax_dev;
struct mapped_device *md;
void *old_md;
@@ -2122,15 +2123,15 @@ static struct mapped_device *alloc_dev(int minor)
md->disk->private_data = md;
sprintf(md->disk->disk_name, "dm-%d", minor);
- if (IS_ENABLED(CONFIG_FS_DAX)) {
- md->dax_dev = alloc_dax(md, &dm_dax_ops);
- if (IS_ERR(md->dax_dev)) {
- md->dax_dev = NULL;
+ dax_dev = alloc_dax(md, &dm_dax_ops);
+ if (IS_ERR(dax_dev)) {
+ if (PTR_ERR(dax_dev) != -EOPNOTSUPP)
goto bad;
- }
- set_dax_nocache(md->dax_dev);
- set_dax_nomc(md->dax_dev);
- if (dax_add_host(md->dax_dev, md->disk))
+ } else {
+ set_dax_nocache(dax_dev);
+ set_dax_nomc(dax_dev);
+ md->dax_dev = dax_dev;
+ if (dax_add_host(dax_dev, md->disk))
goto bad;
}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 4e8fdcb3f1c8..e9898457a7bd 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -560,18 +560,19 @@ static int pmem_attach_disk(struct device *dev,
dax_dev = alloc_dax(pmem, &pmem_dax_ops);
if (IS_ERR(dax_dev)) {
rc = PTR_ERR(dax_dev);
- goto out;
+ if (rc != -EOPNOTSUPP)
+ goto out;
+ } else {
+ set_dax_nocache(dax_dev);
+ set_dax_nomc(dax_dev);
+ if (is_nvdimm_sync(nd_region))
+ set_dax_synchronous(dax_dev);
+ pmem->dax_dev = dax_dev;
+ rc = dax_add_host(dax_dev, disk);
+ if (rc)
+ goto out_cleanup_dax;
+ dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
}
- set_dax_nocache(dax_dev);
- set_dax_nomc(dax_dev);
- if (is_nvdimm_sync(nd_region))
- set_dax_synchronous(dax_dev);
- rc = dax_add_host(dax_dev, disk);
- if (rc)
- goto out_cleanup_dax;
- dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
- pmem->dax_dev = dax_dev;
-
rc = device_add_disk(dev, disk, pmem_attribute_groups);
if (rc)
goto out_remove_host;
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 4b7ecd4fd431..f363c1d51d9a 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -549,6 +549,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
int rc, i, j, num_of_segments;
struct dcssblk_dev_info *dev_info;
struct segment_info *seg_info, *temp;
+ struct dax_device *dax_dev;
char *local_buf;
unsigned long seg_byte_size;
@@ -677,13 +678,13 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
if (rc)
goto put_dev;
- dev_info->dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops);
- if (IS_ERR(dev_info->dax_dev)) {
- rc = PTR_ERR(dev_info->dax_dev);
- dev_info->dax_dev = NULL;
+ dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops);
+ if (IS_ERR(dax_dev)) {
+ rc = PTR_ERR(dax_dev);
goto put_dev;
}
- set_dax_synchronous(dev_info->dax_dev);
+ set_dax_synchronous(dax_dev);
+ dev_info->dax_dev = dax_dev;
rc = dax_add_host(dev_info->dax_dev, dev_info->gd);
if (rc)
goto out_dax;
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index 11c428f4c7cf..7815e9bea69a 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
#include <linux/module.h>
#include <asm/ctlreg.h>
#include <asm/chpid.h>
@@ -26,6 +27,7 @@
#include <asm/sclp.h>
#include <asm/numa.h>
#include <asm/facility.h>
+#include <asm/page-states.h>
#include "sclp.h"
@@ -340,16 +342,38 @@ static int sclp_mem_notifier(struct notifier_block *nb,
if (contains_standby_increment(start, start + size))
rc = -EPERM;
break;
- case MEM_ONLINE:
- case MEM_CANCEL_OFFLINE:
- break;
- case MEM_GOING_ONLINE:
+ case MEM_PREPARE_ONLINE:
+ /*
+ * Access the altmap_start_pfn and altmap_nr_pages fields
+ * within the struct memory_notify specifically when dealing
+ * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
+ *
+ * When altmap is in use, take the specified memory range
+ * online, which includes the altmap.
+ */
+ if (arg->altmap_nr_pages) {
+ start = PFN_PHYS(arg->altmap_start_pfn);
+ size += PFN_PHYS(arg->altmap_nr_pages);
+ }
rc = sclp_mem_change_state(start, size, 1);
+ if (rc || !arg->altmap_nr_pages)
+ break;
+ /*
+ * Set CMMA state to nodat here, since the struct page memory
+ * at the beginning of the memory block will not go through the
+ * buddy allocator later.
+ */
+ __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages);
break;
- case MEM_CANCEL_ONLINE:
- sclp_mem_change_state(start, size, 0);
- break;
- case MEM_OFFLINE:
+ case MEM_FINISH_OFFLINE:
+ /*
+ * When altmap is in use, take the specified memory range
+ * offline, which includes the altmap.
+ */
+ if (arg->altmap_nr_pages) {
+ start = PFN_PHYS(arg->altmap_start_pfn);
+ size += PFN_PHYS(arg->altmap_nr_pages);
+ }
sclp_mem_change_state(start, size, 0);
break;
default:
@@ -400,7 +424,9 @@ static void __init add_memory_merged(u16 rn)
if (!size)
goto skip_add;
for (addr = start; addr < start + size; addr += block_size)
- add_memory(0, addr, block_size, MHP_NONE);
+ add_memory(0, addr, block_size,
+ MACHINE_HAS_EDAT1 ?
+ MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
skip_add:
first_rn = rn;
num = 1;
diff --git a/fs/Kconfig b/fs/Kconfig
index 89fdbefd1075..713c8655fa0a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -60,7 +60,6 @@ endif # BLOCK
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
- depends on !(ARM || MIPS || SPARC)
depends on ZONE_DEVICE || FS_DAX_LIMITED
select FS_IOMAP
select DAX
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 5f1be1da92ce..a28466c2da71 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -16,6 +16,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/highmem.h>
+#include <linux/cleanup.h>
#include <linux/uio.h>
#include "fuse_i.h"
@@ -795,8 +796,11 @@ static void virtio_fs_cleanup_dax(void *data)
put_dax(dax_dev);
}
+DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T))
+
static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
{
+ struct dax_device *dax_dev __free(cleanup_dax) = NULL;
struct virtio_shm_region cache_reg;
struct dev_pagemap *pgmap;
bool have_cache;
@@ -804,6 +808,12 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
if (!IS_ENABLED(CONFIG_FUSE_DAX))
return 0;
+ dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
+ if (IS_ERR(dax_dev)) {
+ int rc = PTR_ERR(dax_dev);
+ return rc == -EOPNOTSUPP ? 0 : rc;
+ }
+
/* Get cache region */
have_cache = virtio_get_shm_region(vdev, &cache_reg,
(u8)VIRTIO_FS_SHMCAP_ID_CACHE);
@@ -849,10 +859,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
__func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
- fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
- if (IS_ERR(fs->dax_dev))
- return PTR_ERR(fs->dax_dev);
-
+ fs->dax_dev = no_free_ptr(dax_dev);
return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
fs->dax_dev);
}
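
virtio_fs now uses the scope-based cleanup helpers from <linux/cleanup.h>: DEFINE_FREE() names a destructor class, __free() attaches it to a local variable, and no_free_ptr() hands ownership off on the success path. A generic sketch of the pattern (example_validate() and example_commit() are hypothetical stand-ins, not kernel APIs):

#include <linux/cleanup.h>
#include <linux/slab.h>
#include <linux/sizes.h>

/* Hypothetical helpers, declared only for the sake of the example. */
static bool example_validate(void *buf);
static void example_commit(void *buf);

/* Declare how pointers of this cleanup class are released. */
DEFINE_FREE(kvfree_buf, void *, if (_T) kvfree(_T))

static int example_setup(void)
{
	/* Released automatically on every early return below... */
	void *buf __free(kvfree_buf) = kvzalloc(SZ_4K, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	if (!example_validate(buf))	/* hypothetical check */
		return -EINVAL;		/* buf is kvfree()d here */

	/* ...unless ownership is transferred with no_free_ptr(). */
	example_commit(no_free_ptr(buf));
	return 0;
}
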
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3f78ebbb795f..23fbab954c20 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
-static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
- struct pagemapread *pm)
+static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
{
pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
@@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
hole_end = end;
for (; addr < hole_end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
if (vma->vm_flags & VM_SOFTDIRTY)
pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
if (pm->show_pfn) {
@@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pagemap_entry_t pme;
pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
}
@@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
return err;
if (pm->show_pfn && (flags & PM_PRESENT))
@@ -1807,7 +1806,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pte_to_swp_entry(pte);
if (is_pfn_swap_entry(swp) &&
- !PageAnon(pfn_swap_entry_to_page(swp)))
+ !folio_test_anon(pfn_swap_entry_folio(swp)))
categories |= PAGE_IS_FILE;
}
if (pte_swp_soft_dirty(pte))
@@ -1873,7 +1872,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pmd_to_swp_entry(pmd);
if (is_pfn_swap_entry(swp) &&
- !PageAnon(pfn_swap_entry_to_page(swp)))
+ !folio_test_anon(pfn_swap_entry_folio(swp)))
categories |= PAGE_IS_FILE;
}
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 959551ff9a95..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
-/*
- * Start with fault_pending_wqh and fault_wqh so they're more likely
- * to be in the same cacheline.
- *
- * Locking order:
- * fd_wqh.lock
- * fault_pending_wqh.lock
- * fault_wqh.lock
- * event_wqh.lock
- *
- * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
- * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
- * also taken in IRQ context.
- */
-struct userfaultfd_ctx {
- /* waitqueue head for the pending (i.e. not read) userfaults */
- wait_queue_head_t fault_pending_wqh;
- /* waitqueue head for the userfaults */
- wait_queue_head_t fault_wqh;
- /* waitqueue head for the pseudo fd to wakeup poll/read */
- wait_queue_head_t fd_wqh;
- /* waitqueue head for events */
- wait_queue_head_t event_wqh;
- /* a refile sequence protected by fault_pending_wqh lock */
- seqcount_spinlock_t refile_seq;
- /* pseudo fd refcounting */
- refcount_t refcount;
- /* userfaultfd syscall flags */
- unsigned int flags;
- /* features requested from the userspace */
- unsigned int features;
- /* released */
- bool released;
- /* memory mappings are changing because of non-cooperative event */
- atomic_t mmap_changing;
- /* mm with one ore more vmas attached to this userfaultfd_ctx */
- struct mm_struct *mm;
-};
-
struct userfaultfd_fork_ctx {
struct userfaultfd_ctx *orig;
struct userfaultfd_ctx *new;
@@ -724,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->flags = octx->flags;
ctx->features = octx->features;
ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
+ down_write(&octx->map_changing_lock);
atomic_inc(&octx->mmap_changing);
+ up_write(&octx->map_changing_lock);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -776,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
vma_start_write(vma);
@@ -822,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
mmap_read_unlock(mm);
msg_init(&ewq.msg);
@@ -864,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -1748,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- flags);
+ ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1800,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1857,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return -EINVAL;
if (mmget_not_zero(ctx->mm)) {
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1909,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
- &ctx->mmap_changing, flags);
+ ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
+ uffdio_continue.range.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1964,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
- uffdio_poison.range.len,
- &ctx->mmap_changing, 0);
+ ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
+ uffdio_poison.range.len, 0);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -2040,16 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
return -EINVAL;
if (mmget_not_zero(mm)) {
- mmap_read_lock(mm);
-
- /* Re-check after taking mmap_lock */
- if (likely(!atomic_read(&ctx->mmap_changing)))
- ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
- uffdio_move.len, uffdio_move.mode);
- else
- ret = -EINVAL;
-
- mmap_read_unlock(mm);
+ ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+ uffdio_move.len, uffdio_move.mode);
mmput(mm);
} else {
return -ESRCH;
@@ -2255,6 +2212,7 @@ static int new_userfaultfd(int flags)
ctx->flags = flags;
ctx->features = 0;
ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 129a3a759976..709830274b75 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -69,6 +69,7 @@
*
* - tlb_remove_page() / __tlb_remove_page()
* - tlb_remove_page_size() / __tlb_remove_page_size()
+ * - __tlb_remove_folio_pages()
*
* __tlb_remove_page_size() is the basic primitive that queues a page for
* freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
@@ -78,6 +79,11 @@
* tlb_remove_page() and tlb_remove_page_size() imply the call to
* tlb_flush_mmu() when required and has no return value.
*
+ * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however,
+ * instead of removing a single page, remove the given number of consecutive
+ * pages that are all part of the same (large) folio: just like calling
+ * __tlb_remove_page() on each page individually.
+ *
* - tlb_change_page_size()
*
* call before __tlb_remove_page*() to set the current page-size; implies a
@@ -260,9 +266,10 @@ struct mmu_gather_batch {
*/
#define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH)
-extern bool __tlb_remove_page_size(struct mmu_gather *tlb,
- struct encoded_page *page,
- int page_size);
+extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+ bool delay_rmap, int page_size);
+bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
+ unsigned int nr_pages, bool delay_rmap);
#ifdef CONFIG_SMP
/*
@@ -462,13 +469,14 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
{
- if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size))
+ if (__tlb_remove_page_size(tlb, page, false, page_size))
tlb_flush_mmu(tlb);
}
-static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags)
+static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb,
+ struct page *page, bool delay_rmap)
{
- return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE);
+ return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE);
}
/* tlb_remove_page
@@ -592,7 +600,9 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
}
#ifndef __tlb_remove_tlb_entry
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
+static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
+{
+}
#endif
/**
@@ -608,6 +618,26 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
+/**
+ * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for
+ * later tlb invalidation.
+ *
+ * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple
+ * consecutive ptes instead of only a single one.
+ */
+static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb,
+ pte_t *ptep, unsigned int nr, unsigned long address)
+{
+ tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr);
+ for (;;) {
+ __tlb_remove_tlb_entry(tlb, ptep, address);
+ if (--nr == 0)
+ break;
+ ptep++;
+ address += PAGE_SIZE;
+ }
+}
+
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
do { \
unsigned long _sz = huge_page_size(h); \
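To illustrate how the batched mmu_gather entry points are meant to be used together, here is a rough, hypothetical caller; get_and_clear_full_ptes() is introduced in the include/linux/pgtable.h hunk further below, and a real caller such as zap_pte_range() carries much more state:

/* Sketch only: clear nr consecutive PTEs of one folio and queue the TLB flush. */
static pte_t foo_zap_folio_ptes(struct mmu_gather *tlb, struct mm_struct *mm,
				unsigned long addr, pte_t *ptep, unsigned int nr)
{
	/* Collect dirty/accessed bits from all nr PTEs while clearing them. */
	pte_t pte = get_and_clear_full_ptes(mm, addr, ptep, nr, 0);

	/* Remember all nr entries for the deferred TLB invalidation. */
	tlb_remove_tlb_entries(tlb, ptep, nr, addr);

	return pte;
}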
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index d504eb4b49ab..2cb15fe4fe12 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -138,4 +138,10 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level)
#define use_arch_cache_info() (false)
#endif
+#ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING
+#define cpu_dcache_is_aliasing() false
+#else
+#include <asm/cachetype.h>
+#endif
+
#endif /* _LINUX_CACHEINFO_H */
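A consumer simply gates a feature on the new predicate; for instance (hypothetical driver check, mirroring how the DAX core uses it elsewhere in this series):

#include <linux/cacheinfo.h>
#include <linux/errno.h>

static int foo_enable_direct_map(void)
{
	/*
	 * Virtually aliased data caches cannot guarantee coherency for
	 * direct (DAX-style) mappings, so bail out gracefully.
	 */
	if (cpu_dcache_is_aliasing())
		return -EOPNOTSUPP;

	return 0;
}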
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 63873b93deaa..9db877506ea8 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -6,12 +6,8 @@
#include <linux/types.h>
#include <linux/numa.h>
-/*
- * There is always at least global CMA area and a few optional
- * areas configured in kernel .config.
- */
#ifdef CONFIG_CMA_AREAS
-#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS)
+#define MAX_CMA_AREAS CONFIG_CMA_AREAS
#endif
#define CMA_MAX_NAME 64
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b463502b16e1..9d3e3327af4c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -63,6 +63,8 @@ void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
bool dax_synchronous(struct dax_device *dax_dev);
+void set_dax_nocache(struct dax_device *dax_dev);
+void set_dax_nomc(struct dax_device *dax_dev);
void set_dax_synchronous(struct dax_device *dax_dev);
size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i);
@@ -86,11 +88,7 @@ static inline void *dax_holder(struct dax_device *dax_dev)
static inline struct dax_device *alloc_dax(void *private,
const struct dax_operations *ops)
{
- /*
- * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
- * NULL is an error or expected.
- */
- return NULL;
+ return ERR_PTR(-EOPNOTSUPP);
}
static inline void put_dax(struct dax_device *dax_dev)
{
@@ -109,6 +107,12 @@ static inline bool dax_synchronous(struct dax_device *dax_dev)
{
return true;
}
+static inline void set_dax_nocache(struct dax_device *dax_dev)
+{
+}
+static inline void set_dax_nomc(struct dax_device *dax_dev)
+{
+}
static inline void set_dax_synchronous(struct dax_device *dax_dev)
{
}
@@ -124,9 +128,6 @@ static inline size_t dax_recovery_write(struct dax_device *dax_dev,
}
#endif
-void set_dax_nocache(struct dax_device *dax_dev);
-void set_dax_nomc(struct dax_device *dax_dev);
-
struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
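With the !CONFIG_DAX stub now returning ERR_PTR(-EOPNOTSUPP) instead of NULL, callers no longer need an IS_ENABLED(CONFIG_DAX) check. A hypothetical caller (foo_* names are illustrative) treats -EOPNOTSUPP as "run without DAX", exactly as the virtio_fs hunk above does:

struct foo_device {			/* illustrative */
	struct dax_device *dax_dev;
};

static const struct dax_operations foo_dax_ops;	/* illustrative, not filled in */

static int foo_setup_dax(struct foo_device *foo)
{
	struct dax_device *dax_dev = alloc_dax(foo, &foo_dax_ops);

	if (IS_ERR(dax_dev)) {
		/* DAX compiled out or unusable on this CPU: carry on without it. */
		if (PTR_ERR(dax_dev) == -EOPNOTSUPP)
			return 0;
		return PTR_ERR(dax_dev);
	}

	foo->dax_dev = dax_dev;
	return 0;
}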
diff --git a/include/linux/efi.h b/include/linux/efi.h
index c74f47711f0b..57da15e7429c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -692,6 +692,11 @@ extern struct efi {
extern struct mm_struct efi_mm;
+static inline bool mm_is_efi(struct mm_struct *mm)
+{
+ return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm;
+}
+
static inline int
efi_guidcmp (efi_guid_t left, efi_guid_t right)
{
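The new helper replaces open-coded IS_ENABLED(CONFIG_EFI) && mm == &efi_mm checks in arch code; a trivial sketch (foo_* is illustrative):

static void foo_flush_context(struct mm_struct *mm)
{
	/* The EFI runtime services mm gets no user-style context handling. */
	if (mm_is_efi(mm))
		return;

	/* ... normal per-mm work here ... */
}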
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index de292a007138..09e22091f1b0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -334,7 +334,7 @@ void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
void page_alloc_init_late(void);
-void setup_pcp_cacheinfo(void);
+void setup_pcp_cacheinfo(unsigned int cpu);
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 451c1dff0e87..00341b56d291 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -439,6 +439,13 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len)
kunmap_local(addr);
}
+/**
+ * memcpy_from_folio - Copy a range of bytes from a folio.
+ * @to: The memory to copy to.
+ * @folio: The folio to read from.
+ * @offset: The first byte in the folio to read.
+ * @len: The number of bytes to copy.
+ */
static inline void memcpy_from_folio(char *to, struct folio *folio,
size_t offset, size_t len)
{
@@ -460,6 +467,13 @@ static inline void memcpy_from_folio(char *to, struct folio *folio,
} while (len > 0);
}
+/**
+ * memcpy_to_folio - Copy a range of bytes to a folio.
+ * @folio: The folio to write to.
+ * @offset: The first byte in the folio to store to.
+ * @from: The memory to copy from.
+ * @len: The number of bytes to copy.
+ */
static inline void memcpy_to_folio(struct folio *folio, size_t offset,
const char *from, size_t len)
{
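A filesystem copying a small on-disk record into or out of a (possibly highmem, possibly large) folio would use the newly documented helpers roughly as below; struct foo_record and the function names are illustrative:

#include <linux/highmem.h>
#include <linux/types.h>

struct foo_record {
	__le32 magic;
	__le32 len;
};

static void foo_read_record(struct folio *folio, size_t offset,
			    struct foo_record *rec)
{
	/* Handles highmem mapping and folio-page boundaries internally. */
	memcpy_from_folio((char *)rec, folio, offset, sizeof(*rec));
}

static void foo_write_record(struct folio *folio, size_t offset,
			     const struct foo_record *rec)
{
	memcpy_to_folio(folio, offset, (const char *)rec, sizeof(*rec));
}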
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 7675a48a0701..792b67ceb631 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -24,6 +24,8 @@ enum lru_status {
LRU_SKIP, /* item cannot be locked, skip */
LRU_RETRY, /* item not freeable. May drop the lock
internally, but has to return locked. */
+ LRU_STOP, /* stop lru list walking. May drop the lock
+ internally, but has to return locked. */
};
struct list_lru_one {
@@ -62,8 +64,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
#define list_lru_init(lru) \
__list_lru_init((lru), false, NULL, NULL)
-#define list_lru_init_key(lru, key) \
- __list_lru_init((lru), false, (key), NULL)
#define list_lru_init_memcg(lru, shrinker) \
__list_lru_init((lru), true, NULL, shrinker)
@@ -170,22 +170,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head);
-/**
- * list_lru_putback: undo list_lru_isolate
- * @lru: the lru pointer.
- * @item: the item to put back.
- * @nid: the node id of the sublist to put the item back to.
- * @memcg: the cgroup of the sublist to put the item back to.
- *
- * Put back an isolated item into its original LRU. Note that unlike
- * list_lru_add, this does not increment the node LRU count (as
- * list_lru_isolate does not originally decrement this count).
- *
- * Since we might have dropped the LRU lock in between, recompute list_lru_one
- * from the node's id and memcg.
- */
-void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg);
typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 20ff87f8e001..4e4caeaea404 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -14,6 +14,7 @@
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
+#include <linux/kernel.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f53cfdaaaa41..939a16bd5cea 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order);
#define MEM_GOING_ONLINE (1<<3)
#define MEM_CANCEL_ONLINE (1<<4)
#define MEM_CANCEL_OFFLINE (1<<5)
+#define MEM_PREPARE_ONLINE (1<<6)
+#define MEM_FINISH_OFFLINE (1<<7)
struct memory_notify {
+ /*
+ * The altmap_start_pfn and altmap_nr_pages fields are designated for
+ * specifying the altmap range and are exclusively intended for use in
+ * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
+ */
+ unsigned long altmap_start_pfn;
+ unsigned long altmap_nr_pages;
unsigned long start_pfn;
unsigned long nr_pages;
int status_change_nid_normal;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7d2076583494..7a9ff464608d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -106,6 +106,22 @@ typedef int __bitwise mhp_t;
* implies the node id (nid).
*/
#define MHP_NID_IS_MGID ((__force mhp_t)BIT(2))
+/*
+ * The hotplugged memory is completely inaccessible while the memory is
+ * offline. The memory provider will handle MEM_PREPARE_ONLINE /
+ * MEM_FINISH_OFFLINE notifications and make the memory accessible.
+ *
+ * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY,
+ * because the altmap cannot be written (e.g., poisoned) when adding
+ * memory -- before it is set online.
+ *
+ * This allows for adding memory with an altmap that is not currently
+ * made available by a hypervisor. When onlining that memory, the
+ * hypervisor can be instructed to make that memory available, and
+ * the onlining phase will not require any memory allocations, which is
+ * helpful in low-memory situations.
+ */
+#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3))
/*
* Extended parameters for memory hotplug:
@@ -121,6 +137,7 @@ struct mhp_params {
bool mhp_range_allowed(u64 start, u64 size, bool need_mapping);
struct range mhp_get_pluggable_range(bool need_mapping);
+bool mhp_supports_memmap_on_memory(void);
/*
* Zone resizing functions
@@ -154,7 +171,7 @@ extern void adjust_present_page_count(struct page *page,
long nr_pages);
/* VM interface that may be used by firmware interface */
extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone);
+ struct zone *zone, bool mhp_off_inaccessible);
extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group);
@@ -262,6 +279,11 @@ static inline bool movable_node_is_enabled(void)
return false;
}
+static inline bool mhp_supports_memmap_on_memory(void)
+{
+ return false;
+}
+
static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {}
static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 744c830f4b13..3f7143ade32c 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -25,6 +25,7 @@ struct vmem_altmap {
unsigned long free;
unsigned long align;
unsigned long alloc;
+ bool inaccessible;
};
/*
@@ -108,7 +109,7 @@ struct dev_pagemap_ops {
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @ref: reference count that pins the devm_memremap_pages() mapping
* @done: completion for @ref
- * @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @type: memory type: see MEMORY_* above in memremap.h
 * @flags: PGMAP_* flags to specify detailed behavior
* @vmemmap_shift: structural definition of how the vmemmap page metadata
* is populated, specifically the metadata page order.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..6f4825d82965 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -781,6 +781,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
return NULL;
}
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+ mmap_assert_locked(vma->vm_mm);
+}
+
static inline void release_fault_lock(struct vm_fault *vmf)
{
mmap_read_unlock(vmf->vma->vm_mm);
@@ -2595,19 +2600,19 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member)
mm_trace_rss_stat(mm, member);
}
-/* Optimized variant when page is already known not to be PageAnon */
-static inline int mm_counter_file(struct page *page)
+/* Optimized variant when folio is already known not to be anon */
+static inline int mm_counter_file(struct folio *folio)
{
- if (PageSwapBacked(page))
+ if (folio_test_swapbacked(folio))
return MM_SHMEMPAGES;
return MM_FILEPAGES;
}
-static inline int mm_counter(struct page *page)
+static inline int mm_counter(struct folio *folio)
{
- if (PageAnon(page))
+ if (folio_test_anon(folio))
return MM_ANONPAGES;
- return mm_counter_file(page);
+ return mm_counter_file(folio);
}
static inline unsigned long get_mm_rss(struct mm_struct *mm)
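Callers that already have the folio just pass it through, as the uprobes and rmap hunks in this series do. A minimal sketch of the usual accounting pattern when unmapping a page (foo_* is illustrative):

static void foo_account_unmap(struct mm_struct *mm, struct page *page)
{
	struct folio *folio = page_folio(page);

	/* MM_ANONPAGES, MM_SHMEMPAGES or MM_FILEPAGES, chosen from folio flags. */
	dec_mm_counter(mm, mm_counter(folio));
}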
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..a7223ba3ea1e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -210,8 +210,8 @@ struct page {
*
* An 'encoded_page' pointer is a pointer to a regular 'struct page', but
* with the low bits of the pointer indicating extra context-dependent
- * information. Not super-common, but happens in mmu_gather and mlock
- * handling, and this acts as a type system check on that use.
+ * information. Only used in mmu_gather handling, and this acts as a type
+ * system check on that use.
*
* We only really have two guaranteed bits in general, although you could
* play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
@@ -220,21 +220,46 @@ struct page {
 * Use the supplied helper functions to encode/decode the pointer and bits.
*/
struct encoded_page;
-#define ENCODE_PAGE_BITS 3ul
+
+#define ENCODED_PAGE_BITS 3ul
+
+/* Perform rmap removal after we have flushed the TLB. */
+#define ENCODED_PAGE_BIT_DELAY_RMAP 1ul
+
+/*
+ * The next item in an encoded_page array is the "nr_pages" argument, specifying
+ * the number of consecutive pages starting from this page, that all belong to
+ * the same folio. For example, "nr_pages" corresponds to the number of folio
+ * references that must be dropped. If this bit is not set, "nr_pages" is
+ * implicitly 1.
+ */
+#define ENCODED_PAGE_BIT_NR_PAGES_NEXT 2ul
+
static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags)
{
- BUILD_BUG_ON(flags > ENCODE_PAGE_BITS);
+ BUILD_BUG_ON(flags > ENCODED_PAGE_BITS);
return (struct encoded_page *)(flags | (unsigned long)page);
}
static inline unsigned long encoded_page_flags(struct encoded_page *page)
{
- return ENCODE_PAGE_BITS & (unsigned long)page;
+ return ENCODED_PAGE_BITS & (unsigned long)page;
}
static inline struct page *encoded_page_ptr(struct encoded_page *page)
{
- return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page);
+ return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page);
+}
+
+static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr)
+{
+ VM_WARN_ON_ONCE((nr << 2) >> 2 != nr);
+ return (struct encoded_page *)(nr << 2);
+}
+
+static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page)
+{
+ return ((unsigned long)page) >> 2;
}
/*
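Putting the new bits together, an mmu_gather-style producer/consumer of an encoded_page array would look roughly like the sketch below (foo_* names are illustrative; the real batching code also handles ENCODED_PAGE_BIT_DELAY_RMAP and array bounds):

/* Producer: queue one folio as "page + nr consecutive pages"; slots has room for 2. */
static unsigned int foo_queue_pages(struct encoded_page **slots,
				    struct page *page, unsigned int nr)
{
	if (nr == 1) {
		slots[0] = encode_page(page, 0);
		return 1;
	}
	slots[0] = encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT);
	slots[1] = encode_nr_pages(nr);
	return 2;
}

/* Consumer: recover the page pointer and the page count. */
static void foo_dequeue_pages(struct encoded_page **slots,
			      struct page **page, unsigned int *nr)
{
	*page = encoded_page_ptr(slots[0]);
	if (encoded_page_flags(slots[0]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)
		*nr = encoded_nr_pages(slots[1]);
	else
		*nr = 1;
}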
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index f2b7a3f04099..bbaec80c78c5 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -11,7 +11,7 @@
#endif
#ifndef leave_mm
-static inline void leave_mm(int cpu) { }
+static inline void leave_mm(void) { }
#endif
/*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a497f189d988..633812a1d220 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -464,7 +464,7 @@ enum {
#define NR_BLOOM_FILTERS 2
struct lru_gen_mm_state {
- /* set to max_seq after each iteration */
+ /* synced with max_seq after each iteration */
unsigned long seq;
/* where the current iteration continues after */
struct list_head *head;
@@ -479,8 +479,8 @@ struct lru_gen_mm_state {
struct lru_gen_mm_walk {
/* the lruvec under reclaim */
struct lruvec *lruvec;
- /* unstable max_seq from lru_gen_folio */
- unsigned long max_seq;
+ /* max_seq from lru_gen_folio: can be out of date */
+ unsigned long seq;
/* the next address within an mm to scan */
unsigned long next_addr;
/* to batch promoted pages */
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index c141ea9a95ef..8cd858d912c4 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -4,7 +4,7 @@
#include <linux/atomic.h>
#include <linux/cache.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
#include <asm/page.h>
struct page_counter {
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index f6d0e3513948..a36cf4e124b0 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -212,15 +212,37 @@ static inline int pmd_dirty(pmd_t pmd)
#define arch_flush_lazy_mmu_mode() do {} while (0)
#endif
-#ifndef set_ptes
+#ifndef pte_batch_hint
+/**
+ * pte_batch_hint - Number of pages that can be added to batch without scanning.
+ * @ptep: Page table pointer for the entry.
+ * @pte: Page table entry.
+ *
+ * Some architectures know that a set of contiguous ptes all map the same
+ * contiguous memory with the same permissions. In this case, it can provide a
+ * hint to aid pte batching without the core code needing to scan every pte.
+ *
+ * An architecture implementation may ignore the PTE accessed state. Further,
+ * the dirty state must apply atomically to all the PTEs described by the hint.
+ *
+ * May be overridden by the architecture, else pte_batch_hint is always 1.
+ */
+static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
+{
+ return 1;
+}
+#endif
-#ifndef pte_next_pfn
-static inline pte_t pte_next_pfn(pte_t pte)
+#ifndef pte_advance_pfn
+static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
- return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+ return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
}
#endif
+#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)
+
+#ifndef set_ptes
/**
* set_ptes - Map consecutive pages to a contiguous range of addresses.
* @mm: Address space to map the pages into.
@@ -229,6 +251,10 @@ static inline pte_t pte_next_pfn(pte_t pte)
* @pte: Page table entry for the first page.
* @nr: Number of pages to map.
*
+ * When nr==1, initial state of pte may be present or not present, and new state
+ * may be present or not present. When nr>1, initial state of all ptes must be
+ * not present, and new state must be present.
+ *
* May be overridden by the architecture, or the architecture can define
* set_pte() and PFN_PTE_SHIFT.
*
@@ -580,6 +606,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
}
#endif
+#ifndef get_and_clear_full_ptes
+/**
+ * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
+ * the same folio, collecting dirty/accessed bits.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the
+ * returned PTE.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+ pte_t pte, tmp_pte;
+
+ pte = ptep_get_and_clear_full(mm, addr, ptep, full);
+ while (--nr) {
+ ptep++;
+ addr += PAGE_SIZE;
+ tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full);
+ if (pte_dirty(tmp_pte))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tmp_pte))
+ pte = pte_mkyoung(pte);
+ }
+ return pte;
+}
+#endif
+
+#ifndef clear_full_ptes
+/**
+ * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
+ * folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ for (;;) {
+ ptep_get_and_clear_full(mm, addr, ptep, full);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+#endif
/*
* If two threads concurrently fault at the same page, the thread that
@@ -650,6 +746,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
}
#endif
+#ifndef wrprotect_ptes
+/**
+ * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
+ * folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to write-protect.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_set_wrprotect().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ for (;;) {
+ ptep_set_wrprotect(mm, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+#endif
+
/*
* On some architectures hardware does not set page access bit when accessing
* memory page, it is responsibility of software setting this bit. It brings
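Taken together, the batch helpers let generic code walk a PTE range in folio-sized steps. A hypothetical write-protect loop over part of one PMD table might look like this sketch; a real caller would additionally confirm the batch stays within a single folio (cf. folio_pte_batch() in mm/memory.c) and hold the page table lock:

static void foo_wrprotect_range(struct mm_struct *mm, unsigned long addr,
				unsigned long end, pte_t *ptep)
{
	while (addr < end) {
		pte_t pte = ptep_get(ptep);
		unsigned int nr = 1;

		if (pte_present(pte)) {
			/* Arch hint: how many PTEs are known to be contiguous. */
			nr = pte_batch_hint(ptep, pte);
			nr = min(nr, (unsigned int)((end - addr) >> PAGE_SHIFT));
			wrprotect_ptes(mm, addr, ptep, nr);
		}

		ptep += nr;
		addr += nr * PAGE_SIZE;
	}
}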
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index 2a3a95586425..8dbd51ea8626 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -18,6 +18,16 @@ struct ptdump_state {
const struct ptdump_range *range;
};
+bool ptdump_walk_pgd_level_core(struct seq_file *m,
+ struct mm_struct *mm, pgd_t *pgd,
+ bool checkwx, bool dmesg);
void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd);
+bool ptdump_check_wx(void);
+
+static inline void debug_checkwx(void)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_WX))
+ ptdump_check_wx();
+}
#endif /* _LINUX_PTDUMP_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..998861865b84 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1259,6 +1259,7 @@ struct task_struct {
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
+ u8 il_weight;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -1623,15 +1624,15 @@ extern struct pid *cad_pid;
#define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* Dumped core */
#define PF_SIGNALED 0x00000400 /* Killed by a signal */
-#define PF_MEMALLOC 0x00000800 /* Allocating memory */
+#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF__HOLE__00010000 0x00010000
#define PF_KSWAPD 0x00020000 /* I am kswapd */
-#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
-#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nofs_save() */
+#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
@@ -1641,7 +1642,8 @@ extern struct pid *cad_pid;
#define PF__HOLE__02000000 0x02000000
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
-#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
+#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning.
+ * See memalloc_pin_save() */
#define PF__HOLE__20000000 0x20000000
#define PF__HOLE__40000000 0x40000000
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 9a19f1b42f64..7a4066d22883 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -315,7 +315,8 @@ static inline void might_alloc(gfp_t gfp_mask)
* point of view. Use memalloc_noio_restore to end the scope with flags
* returned by this function.
*
- * This function is safe to be used from any context.
+ * Context: This function is safe to be used from any context.
+ * Return: The saved flags to be passed to memalloc_noio_restore.
*/
static inline unsigned int memalloc_noio_save(void)
{
@@ -346,7 +347,8 @@ static inline void memalloc_noio_restore(unsigned int flags)
* point of view. Use memalloc_nofs_restore to end the scope with flags
* returned by this function.
*
- * This function is safe to be used from any context.
+ * Context: This function is safe to be used from any context.
+ * Return: The saved flags to be passed to memalloc_nofs_restore.
*/
static inline unsigned int memalloc_nofs_save(void)
{
@@ -368,6 +370,29 @@ static inline void memalloc_nofs_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
}
+/**
+ * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
+ *
+ * This function marks the beginning of the __GFP_MEMALLOC allocation scope.
+ * All further allocations will implicitly add the __GFP_MEMALLOC flag, which
+ * prevents entering reclaim and allows access to all memory reserves. This
+ * should only be used when the caller guarantees the allocation will allow more
+ * memory to be freed very shortly, i.e. it needs to allocate some memory in
+ * the process of freeing memory, and cannot reclaim due to potential recursion.
+ *
+ * Users of this scope have to be extremely careful to not deplete the reserves
+ * completely and implement a throttling mechanism which controls the
+ * consumption of the reserve based on the amount of freed memory. Usage of a
+ * pre-allocated pool (e.g. mempool) should be always considered before using
+ * this scope.
+ *
+ * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC
+ *
+ * Context: This function should not be used in an interrupt context because
+ * interrupt context does not give PF_MEMALLOC access to reserves.
+ * See __gfp_pfmemalloc_flags().
+ * Return: The saved flags to be passed to memalloc_noreclaim_restore.
+ */
static inline unsigned int memalloc_noreclaim_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC;
@@ -375,11 +400,29 @@ static inline unsigned int memalloc_noreclaim_save(void)
return flags;
}
+/**
+ * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save
+ * function. Always pass in the value returned by the pairing
+ * memalloc_noreclaim_save() call.
+ */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
+/**
+ * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope.
+ *
+ * This function marks the beginning of the ~__GFP_MOVABLE allocation scope.
+ * All further allocations will implicitly remove the __GFP_MOVABLE flag, which
+ * will constrain the allocations to zones that allow long term pinning, i.e.
+ * not ZONE_MOVABLE zones.
+ *
+ * Return: The saved flags to be passed to memalloc_pin_restore.
+ */
static inline unsigned int memalloc_pin_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_PIN;
@@ -388,6 +431,14 @@ static inline unsigned int memalloc_pin_save(void)
return flags;
}
+/**
+ * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function.
+ * Always pass in the value returned by the pairing
+ * memalloc_pin_save() call.
+ */
static inline void memalloc_pin_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
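Both scope APIs documented above follow the same save/restore discipline; a minimal sketch (foo_* names are illustrative):

static void foo_emergency_free(void)
{
	/* All allocations in this scope implicitly get __GFP_MEMALLOC. */
	unsigned int flags = memalloc_noreclaim_save();

	/* ... allocate the small amount of memory needed to free much more ... */
	memalloc_noreclaim_restore(flags);
}

static void foo_prepare_longterm_pin(void)
{
	/* Allocations here avoid ZONE_MOVABLE so the pages can stay pinned. */
	unsigned int flags = memalloc_pin_save();

	/* ... allocate pages that will be long-term pinned ... */
	memalloc_pin_restore(flags);
}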
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index bff1e8d97de0..48b700ba1d18 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
return p;
}
+static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
+{
+ struct folio *folio = pfn_folio(swp_offset_pfn(entry));
+
+ /*
+ * Any use of migration entries may only occur while the
+ * corresponding folio is locked
+ */
+ BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio));
+
+ return folio;
+}
+
/*
* A pfn swap entry is a special type of swap entry that always has a pfn stored
* in the swap offset. They are used to represent unaddressable device memory
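The new helper mirrors pfn_swap_entry_to_page() but hands back the folio directly, which is what the task_mmu.c hunks above switch to; a condensed sketch of that pattern (foo_* is illustrative, and the caller is assumed to have a non-present swap PTE):

static bool foo_swap_entry_is_file_backed(pte_t pte)
{
	swp_entry_t entry = pte_to_swp_entry(pte);

	/* Only pfn swap entries (migration/device) have a folio behind them. */
	if (!is_pfn_swap_entry(entry))
		return false;

	return !folio_test_anon(pfn_swap_entry_folio(entry));
}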
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e4056547fbe6..05d59f74fc88 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -36,6 +36,52 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ *
+ * Locking order:
+ * fd_wqh.lock
+ * fault_pending_wqh.lock
+ * fault_wqh.lock
+ * event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
+ */
+struct userfaultfd_ctx {
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
+ wait_queue_head_t fault_wqh;
+ /* waitqueue head for the pseudo fd to wakeup poll/read */
+ wait_queue_head_t fd_wqh;
+ /* waitqueue head for events */
+ wait_queue_head_t event_wqh;
+ /* a refile sequence protected by fault_pending_wqh lock */
+ seqcount_spinlock_t refile_seq;
+ /* pseudo fd refcounting */
+ refcount_t refcount;
+ /* userfaultfd syscall flags */
+ unsigned int flags;
+ /* features requested from the userspace */
+ unsigned int features;
+ /* released */
+ bool released;
+ /*
+ * Prevents userfaultfd operations (fill/move/wp) from happening while
+ * some non-cooperative event(s) is taking place. Increments are done
+ * in write-mode, whereas userfaultfd operations, which include
+ * reading mmap_changing, are done under read-mode.
+ */
+ struct rw_semaphore map_changing_lock;
+ /* memory mappings are changing because of non-cooperative event */
+ atomic_t mmap_changing;
+ /* mm with one or more vmas attached to this userfaultfd_ctx */
+ struct mm_struct *mm;
+};
+
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
/* A combined operation mode + behavior flags. */
@@ -74,31 +120,26 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
unsigned long dst_addr, struct page *page,
bool newly_allocated, uffd_flags_t flags);
-extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, uffd_flags_t flags);
-extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+ uffd_flags_t flags);
+extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
- unsigned long len,
- atomic_t *mmap_changing);
-extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags);
-extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags);
-extern int mwriteprotect_range(struct mm_struct *dst_mm,
- unsigned long start, unsigned long len,
- bool enable_wp, atomic_t *mmap_changing);
+ unsigned long len);
+extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long len, uffd_flags_t flags);
+extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags);
+extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
- unsigned long dst_start, unsigned long src_start,
- unsigned long len, __u64 flags);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len, __u64 flags);
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma,
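On the operation side (implemented in mm/userfaultfd.c, not shown in this section) the expected pattern is to take the new rwsem in read mode and only then test mmap_changing. A rough sketch, with foo_uffd_op standing in for any of the mfill/move/write-protect entry points:

static ssize_t foo_uffd_op(struct userfaultfd_ctx *ctx)
{
	ssize_t ret = -EAGAIN;

	down_read(&ctx->map_changing_lock);
	/* A non-cooperative event (fork/mremap/...) is in flight: retry later. */
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	ret = 0;
	/* ... perform the fill/move/write-protect work here ... */
out_unlock:
	up_read(&ctx->map_changing_lock);
	return ret;
}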
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 0b709f5bc65f..341aea490070 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -29,8 +29,8 @@ struct zswap_lruvec_state {
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
-void zswap_invalidate(int type, pgoff_t offset);
-void zswap_swapon(int type);
+void zswap_invalidate(swp_entry_t swp);
+int zswap_swapon(int type, unsigned long nr_pages);
void zswap_swapoff(int type);
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
void zswap_lruvec_state_init(struct lruvec *lruvec);
@@ -50,8 +50,11 @@ static inline bool zswap_load(struct folio *folio)
return false;
}
-static inline void zswap_invalidate(int type, pgoff_t offset) {}
-static inline void zswap_swapon(int type) {}
+static inline void zswap_invalidate(swp_entry_t swp) {}
+static inline int zswap_swapon(int type, unsigned long nr_pages)
+{
+ return 0;
+}
static inline void zswap_swapoff(int type) {}
static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 2b2a975efd20..d05759d18538 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -78,10 +78,10 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepage
#ifdef CONFIG_COMPACTION
TRACE_EVENT(mm_compaction_migratepages,
- TP_PROTO(struct compact_control *cc,
+ TP_PROTO(unsigned int nr_migratepages,
unsigned int nr_succeeded),
- TP_ARGS(cc, nr_succeeded),
+ TP_ARGS(nr_migratepages, nr_succeeded),
TP_STRUCT__entry(
__field(unsigned long, nr_migrated)
@@ -90,7 +90,7 @@ TRACE_EVENT(mm_compaction_migratepages,
TP_fast_assign(
__entry->nr_migrated = nr_succeeded;
- __entry->nr_failed = cc->nr_migratepages - nr_succeeded;
+ __entry->nr_failed = nr_migratepages - nr_succeeded;
),
TP_printk("nr_migrated=%lu nr_failed=%lu",
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index a8963f7ef4c2..1f9bb10d1a47 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -23,6 +23,7 @@ enum {
MPOL_INTERLEAVE,
MPOL_LOCAL,
MPOL_PREFERRED_MANY,
+ MPOL_WEIGHTED_INTERLEAVE,
MPOL_MAX, /* always last member of enum */
};
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..749a9f8d2c9b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -99,6 +99,7 @@
#include <linux/init_syscalls.h>
#include <linux/stackdepot.h>
#include <linux/randomize_kstack.h>
+#include <linux/ptdump.h>
#include <net/net_namespace.h>
#include <asm/io.h>
@@ -1408,6 +1409,7 @@ static void mark_readonly(void)
*/
rcu_barrier();
mark_rodata_ro();
+ debug_checkwx();
rodata_test();
} else
pr_info("Kernel memory protection disabled.\n");
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index f005c66f378c..055da410ac71 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -37,12 +37,6 @@
#define pr_fmt(fmt) "cma: " fmt
-#ifdef CONFIG_CMA_DEBUG
-#ifndef DEBUG
-# define DEBUG
-#endif
-#endif
-
#include <asm/page.h>
#include <linux/memblock.h>
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 929e98c62965..e4834d23e1d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
dec_mm_counter(mm, MM_ANONPAGES);
if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(old_page));
+ dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 6f241bb38799..82fb5195c235 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1307,8 +1307,8 @@ static inline void mas_free(struct ma_state *mas, struct maple_enode *used)
}
/*
- * mas_node_count() - Check if enough nodes are allocated and request more if
- * there is not enough nodes.
+ * mas_node_count_gfp() - Check if enough nodes are allocated and request more
+ * if there are not enough nodes.
* @mas: The maple state
* @count: The number of nodes needed
* @gfp: the gfp flags
@@ -2271,8 +2271,6 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast)
struct ma_state l_tmp = *mast->orig_l;
unsigned char depth = 0;
- r_tmp = *mast->orig_r;
- l_tmp = *mast->orig_l;
do {
mas_ascend(mast->orig_r);
mas_ascend(mast->orig_l);
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index e77d4856442c..ebe2af2e072d 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -423,6 +423,59 @@ static noinline void check_cmpxchg(struct xarray *xa)
XA_BUG_ON(xa, !xa_empty(xa));
}
+static noinline void check_cmpxchg_order(struct xarray *xa)
+{
+#ifdef CONFIG_XARRAY_MULTI
+ void *FIVE = xa_mk_value(5);
+ unsigned int i, order = 3;
+
+ XA_BUG_ON(xa, xa_store_order(xa, 0, order, FIVE, GFP_KERNEL));
+
+ /* Check entry FIVE has the order saved */
+ XA_BUG_ON(xa, xa_get_order(xa, xa_to_value(FIVE)) != order);
+
+ /* Check all the tied indexes have the same entry and order */
+ for (i = 0; i < (1 << order); i++) {
+ XA_BUG_ON(xa, xa_load(xa, i) != FIVE);
+ XA_BUG_ON(xa, xa_get_order(xa, i) != order);
+ }
+
+ /* Ensure that nothing is stored at index '1 << order' */
+ XA_BUG_ON(xa, xa_load(xa, 1 << order) != NULL);
+
+ /*
+ * Additionally, keep the node information and the order at
+ * '1 << order'
+ */
+ XA_BUG_ON(xa, xa_store_order(xa, 1 << order, order, FIVE, GFP_KERNEL));
+ for (i = (1 << order); i < (1 << order) + (1 << order) - 1; i++) {
+ XA_BUG_ON(xa, xa_load(xa, i) != FIVE);
+ XA_BUG_ON(xa, xa_get_order(xa, i) != order);
+ }
+
+ /* Conditionally replace FIVE entry at index '0' with NULL */
+ XA_BUG_ON(xa, xa_cmpxchg(xa, 0, FIVE, NULL, GFP_KERNEL) != FIVE);
+
+ /* Verify the order is lost at FIVE (and old) entries */
+ XA_BUG_ON(xa, xa_get_order(xa, xa_to_value(FIVE)) != 0);
+
+ /* Verify the order and entries are lost in all the tied indexes */
+ for (i = 0; i < (1 << order); i++) {
+ XA_BUG_ON(xa, xa_load(xa, i) != NULL);
+ XA_BUG_ON(xa, xa_get_order(xa, i) != 0);
+ }
+
+ /* Verify node and order are kept at '1 << order' */
+ for (i = (1 << order); i < (1 << order) + (1 << order) - 1; i++) {
+ XA_BUG_ON(xa, xa_load(xa, i) != FIVE);
+ XA_BUG_ON(xa, xa_get_order(xa, i) != order);
+ }
+
+ xa_store_order(xa, 0, BITS_PER_LONG - 1, NULL, GFP_KERNEL);
+ XA_BUG_ON(xa, !xa_empty(xa));
+#endif
+}
+
static noinline void check_reserve(struct xarray *xa)
{
void *entry;
@@ -674,6 +727,181 @@ static noinline void check_multi_store(struct xarray *xa)
#endif
}
+#ifdef CONFIG_XARRAY_MULTI
+/* mimics page cache __filemap_add_folio() */
+static noinline void check_xa_multi_store_adv_add(struct xarray *xa,
+ unsigned long index,
+ unsigned int order,
+ void *p)
+{
+ XA_STATE(xas, xa, index);
+ unsigned int nrpages = 1UL << order;
+
+ /* users are responsible for index alignment to the order when adding */
+ XA_BUG_ON(xa, index & (nrpages - 1));
+
+ xas_set_order(&xas, index, order);
+
+ do {
+ xas_lock_irq(&xas);
+
+ xas_store(&xas, p);
+ XA_BUG_ON(xa, xas_error(&xas));
+ XA_BUG_ON(xa, xa_load(xa, index) != p);
+
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ XA_BUG_ON(xa, xas_error(&xas));
+}
+
+/* mimics page_cache_delete() */
+static noinline void check_xa_multi_store_adv_del_entry(struct xarray *xa,
+ unsigned long index,
+ unsigned int order)
+{
+ XA_STATE(xas, xa, index);
+
+ xas_set_order(&xas, index, order);
+ xas_store(&xas, NULL);
+ xas_init_marks(&xas);
+}
+
+static noinline void check_xa_multi_store_adv_delete(struct xarray *xa,
+ unsigned long index,
+ unsigned int order)
+{
+ xa_lock_irq(xa);
+ check_xa_multi_store_adv_del_entry(xa, index, order);
+ xa_unlock_irq(xa);
+}
+
+/* mimics page cache filemap_get_entry() */
+static noinline void *test_get_entry(struct xarray *xa, unsigned long index)
+{
+ XA_STATE(xas, xa, index);
+ void *p;
+ static unsigned int loops = 0;
+
+ rcu_read_lock();
+repeat:
+ xas_reset(&xas);
+ p = xas_load(&xas);
+ if (xas_retry(&xas, p))
+ goto repeat;
+ rcu_read_unlock();
+
+ /*
+ * This is not part of the page cache, this selftest is pretty
+ * aggressive and does not want to trust the xarray API but rather
+ * test it, and for order 20 (4 GiB block size) we can loop over
+ * over a million entries which can cause a soft lockup. Page cache
+ * APIs won't be stupid, proper page cache APIs loop over the proper
+ * order so when using a larger order we skip shared entries.
+ */
+ if (++loops % XA_CHECK_SCHED == 0)
+ schedule();
+
+ return p;
+}
+
+static unsigned long some_val = 0xdeadbeef;
+static unsigned long some_val_2 = 0xdeaddead;
+
+/* mimics the page cache usage */
+static noinline void check_xa_multi_store_adv(struct xarray *xa,
+ unsigned long pos,
+ unsigned int order)
+{
+ unsigned int nrpages = 1UL << order;
+ unsigned long index, base, next_index, next_next_index;
+ unsigned int i;
+
+ index = pos >> PAGE_SHIFT;
+ base = round_down(index, nrpages);
+ next_index = round_down(base + nrpages, nrpages);
+ next_next_index = round_down(next_index + nrpages, nrpages);
+
+ check_xa_multi_store_adv_add(xa, base, order, &some_val);
+
+ for (i = 0; i < nrpages; i++)
+ XA_BUG_ON(xa, test_get_entry(xa, base + i) != &some_val);
+
+ XA_BUG_ON(xa, test_get_entry(xa, next_index) != NULL);
+
+ /* Use order 0 for the next item */
+ check_xa_multi_store_adv_add(xa, next_index, 0, &some_val_2);
+ XA_BUG_ON(xa, test_get_entry(xa, next_index) != &some_val_2);
+
+ /* Remove the next item */
+ check_xa_multi_store_adv_delete(xa, next_index, 0);
+
+ /* Now use order for a new pointer */
+ check_xa_multi_store_adv_add(xa, next_index, order, &some_val_2);
+
+ for (i = 0; i < nrpages; i++)
+ XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != &some_val_2);
+
+ check_xa_multi_store_adv_delete(xa, next_index, order);
+ check_xa_multi_store_adv_delete(xa, base, order);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /* starting fresh again */
+
+ /* let's test some holes now */
+
+ /* hole at base and next_next */
+ check_xa_multi_store_adv_add(xa, next_index, order, &some_val_2);
+
+ for (i = 0; i < nrpages; i++)
+ XA_BUG_ON(xa, test_get_entry(xa, base + i) != NULL);
+
+ for (i = 0; i < nrpages; i++)
+ XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != &some_val_2);
+
+ for (i = 0; i < nrpages; i++)
+ XA_BUG_ON(xa, test_get_entry(xa, next_next_index + i) != NULL);
+
+ check_xa_multi_store_adv_delete(xa, next_index, order);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /* hole at base and next */
+
+ check_xa_multi_store_adv_add(xa, next_next_index, order, &some_val_2);
+
+ for (i = 0; i < nrpages; i++)
+ XA_BUG_ON(xa, test_get_entry(xa, base + i) != NULL);
+
+ for (i = 0; i < nrpages; i++)
+ XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != NULL);
+
+ for (i = 0; i < nrpages; i++)
+ XA_BUG_ON(xa, test_get_entry(xa, next_next_index + i) != &some_val_2);
+
+ check_xa_multi_store_adv_delete(xa, next_next_index, order);
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+#endif
+
+static noinline void check_multi_store_advanced(struct xarray *xa)
+{
+#ifdef CONFIG_XARRAY_MULTI
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+ unsigned long end = ULONG_MAX/2;
+ unsigned long pos, i;
+
+ /*
+ * About 117 million tests below.
+ */
+ for (pos = 7; pos < end; pos = (pos * pos) + 564) {
+ for (i = 0; i < max_order; i++) {
+ check_xa_multi_store_adv(xa, pos, i);
+ check_xa_multi_store_adv(xa, pos + 157, i);
+ }
+ }
+#endif
+}
+
static noinline void check_xa_alloc_1(struct xarray *xa, unsigned int base)
{
int i;
@@ -1801,9 +2029,11 @@ static int xarray_checks(void)
check_xas_erase(&array);
check_insert(&array);
check_cmpxchg(&array);
+ check_cmpxchg_order(&array);
check_reserve(&array);
check_reserve(&xa0);
check_multi_store(&array);
+ check_multi_store_advanced(&array);
check_get_order(&array);
check_xa_alloc();
check_find(&array);
diff --git a/mm/Kconfig b/mm/Kconfig
index ffc3a2ba3a8c..b924f4a5a3ef 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -45,22 +45,6 @@ config ZSWAP_DEFAULT_ON
The selection made here can be overridden by using the kernel
command line 'zswap.enabled=' option.
-config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON
- bool "Invalidate zswap entries when pages are loaded"
- depends on ZSWAP
- help
- If selected, exclusive loads for zswap will be enabled at boot,
- otherwise it will be disabled.
-
- If exclusive loads are enabled, when a page is loaded from zswap,
- the zswap entry is invalidated at once, as opposed to leaving it
- in zswap until the swap entry is freed.
-
- This avoids having two copies of the same page in memory
- (compressed and uncompressed) after faulting in a page from zswap.
- The cost is that if the page was never dirtied and needs to be
- swapped out again, it will be re-compressed.
-
config ZSWAP_SHRINKER_DEFAULT_ON
bool "Shrink the zswap pool on memory pressure"
depends on ZSWAP
@@ -901,15 +885,6 @@ config CMA
If unsure, say "n".
-config CMA_DEBUG
- bool "CMA debug messages (DEVELOPMENT)"
- depends on DEBUG_KERNEL && CMA
- help
- Turns on debug messages in CMA. This produces KERN_DEBUG
- messages for every CMA call as well as various messages while
- processing calls such as dma_alloc_from_contiguous().
- This option does not affect warning and error messages.
-
config CMA_DEBUGFS
bool "CMA debugfs interface"
depends on CMA && DEBUG_FS
@@ -926,14 +901,14 @@ config CMA_SYSFS
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
- default 19 if NUMA
- default 7
+ default 20 if NUMA
+ default 8
help
CMA allows one to create CMA areas for a particular purpose, mainly
used as device private areas. This parameter sets the maximum
number of CMA areas in the system.
- If unsure, leave the default value "7" in UMA and "19" in NUMA.
+ If unsure, leave the default value "8" in UMA and "20" in NUMA.
config MEM_SOFT_DIRTY
bool "Track memory changes"
@@ -998,6 +973,12 @@ config IDLE_PAGE_TRACKING
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
+# Architectures which implement cpu_dcache_is_aliasing() to query
+# whether the data caches are aliased (VIVT or VIPT with dcache
+# aliasing) need to select this.
+config ARCH_HAS_CPU_CACHE_ALIASING
+ bool
+
config ARCH_HAS_CACHE_LINE_SIZE
bool
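
The new ARCH_HAS_CPU_CACHE_ALIASING symbol is select-only: an architecture that can report data-cache aliasing selects it and provides cpu_dcache_is_aliasing() in its asm/cachetype.h (the per-arch headers added elsewhere in this series are the authoritative versions). A minimal sketch, assuming a hypothetical architecture whose D-cache always aliases:

/* Hypothetical asm/cachetype.h for an arch that selects ARCH_HAS_CPU_CACHE_ALIASING. */
#ifndef __ASM_CACHETYPE_H
#define __ASM_CACHETYPE_H

#include <linux/types.h>

/*
 * Report whether the data cache is aliasing (VIVT, or VIPT with D-cache
 * aliasing), so that generic code can decide whether extra cache
 * maintenance is needed for shared mappings.
 */
static inline bool cpu_dcache_is_aliasing(void)
{
	return true;	/* assumption: this example architecture always aliases */
}

#endif /* __ASM_CACHETYPE_H */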
diff --git a/mm/cma.c b/mm/cma.c
index 7c09c47e530b..01f5a8f71ddf 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -14,11 +14,6 @@
#define pr_fmt(fmt) "cma: " fmt
-#ifdef CONFIG_CMA_DEBUG
-#ifndef DEBUG
-# define DEBUG
-#endif
-#endif
#define CREATE_TRACE_POINTS
#include <linux/memblock.h>
@@ -387,7 +382,6 @@ err:
return ret;
}
-#ifdef CONFIG_CMA_DEBUG
static void cma_debug_show_areas(struct cma *cma)
{
unsigned long next_zero_bit, next_set_bit, nr_zero;
@@ -412,9 +406,6 @@ static void cma_debug_show_areas(struct cma *cma)
pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
spin_unlock_irq(&cma->lock);
}
-#else
-static inline void cma_debug_show_areas(struct cma *cma) { }
-#endif
/**
* cma_alloc() - allocate pages from contiguous area
@@ -436,17 +427,18 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
unsigned long i;
struct page *page = NULL;
int ret = -ENOMEM;
+ const char *name = cma ? cma->name : NULL;
+
+ trace_cma_alloc_start(name, count, align);
if (!cma || !cma->count || !cma->bitmap)
- goto out;
+ return page;
pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
(void *)cma, cma->name, count, align);
if (!count)
- goto out;
-
- trace_cma_alloc_start(cma->name, count, align);
+ return page;
mask = cma_bitmap_aligned_mask(cma, align);
offset = cma_bitmap_aligned_offset(cma, align);
@@ -454,7 +446,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- goto out;
+ return page;
for (;;) {
spin_lock_irq(&cma->lock);
@@ -496,8 +488,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
start = bitmap_no + mask + 1;
}
- trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret);
-
/*
* CMA can allocate multiple page blocks, which results in different
* blocks being marked with different tags. Reset the tags to ignore
@@ -515,14 +505,13 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
}
pr_debug("%s(): returned %p\n", __func__, page);
-out:
+ trace_cma_alloc_finish(name, pfn, page, count, align, ret);
if (page) {
count_vm_event(CMA_ALLOC_SUCCESS);
cma_sysfs_account_success_pages(cma, count);
} else {
count_vm_event(CMA_ALLOC_FAIL);
- if (cma)
- cma_sysfs_account_fail_pages(cma, count);
+ cma_sysfs_account_fail_pages(cma, count);
}
return page;
@@ -573,6 +562,7 @@ bool cma_release(struct cma *cma, const struct page *pages,
free_contig_range(pfn, count);
cma_clear_bitmap(cma, pfn, count);
+ cma_sysfs_account_release_pages(cma, count);
trace_cma_release(cma->name, pfn, pages, count);
return true;
diff --git a/mm/cma.h b/mm/cma.h
index 88a0595670b7..ad61cc6dd439 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -27,6 +27,8 @@ struct cma {
atomic64_t nr_pages_succeeded;
/* the number of CMA page allocation failures */
atomic64_t nr_pages_failed;
+ /* the number of CMA page released */
+ atomic64_t nr_pages_released;
/* kobject requires dynamic object */
struct cma_kobject *cma_kobj;
#endif
@@ -44,10 +46,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
#ifdef CONFIG_CMA_SYSFS
void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
+void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages);
#else
static inline void cma_sysfs_account_success_pages(struct cma *cma,
unsigned long nr_pages) {};
static inline void cma_sysfs_account_fail_pages(struct cma *cma,
unsigned long nr_pages) {};
+static inline void cma_sysfs_account_release_pages(struct cma *cma,
+ unsigned long nr_pages) {};
#endif
#endif
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
index 56347d15b7e8..f50db3973171 100644
--- a/mm/cma_sysfs.c
+++ b/mm/cma_sysfs.c
@@ -24,6 +24,11 @@ void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
atomic64_add(nr_pages, &cma->nr_pages_failed);
}
+void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_released);
+}
+
static inline struct cma *cma_from_kobj(struct kobject *kobj)
{
return container_of(kobj, struct cma_kobject, kobj)->cma;
@@ -48,6 +53,15 @@ static ssize_t alloc_pages_fail_show(struct kobject *kobj,
}
CMA_ATTR_RO(alloc_pages_fail);
+static ssize_t release_pages_success_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_released));
+}
+CMA_ATTR_RO(release_pages_success);
+
static void cma_kobj_release(struct kobject *kobj)
{
struct cma *cma = cma_from_kobj(kobj);
@@ -60,6 +74,7 @@ static void cma_kobj_release(struct kobject *kobj)
static struct attribute *cma_attrs[] = {
&alloc_pages_success_attr.attr,
&alloc_pages_fail_attr.attr,
+ &release_pages_success_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(cma);
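
The counter added above appears as a read-only release_pages_success file next to alloc_pages_success and alloc_pages_fail in each CMA area's sysfs directory. A minimal userspace sketch for reading it; the /sys/kernel/mm/cma/<area>/ layout and the area name "reserved" are assumptions for illustration:

#include <stdio.h>

int main(void)
{
	/* The path is an assumption; the last directory is the CMA area name. */
	FILE *f = fopen("/sys/kernel/mm/cma/reserved/release_pages_success", "r");
	unsigned long long released;

	if (!f)
		return 1;
	if (fscanf(f, "%llu", &released) == 1)
		printf("pages released back to CMA: %llu\n", released);
	fclose(f);
	return 0;
}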
diff --git a/mm/compaction.c b/mm/compaction.c
index 4add68d40e8d..f146478b01bc 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1365,12 +1365,14 @@ static bool suitable_migration_target(struct compact_control *cc,
{
/* If the page is a large free page, then disallow migration */
if (PageBuddy(page)) {
+ int order = cc->order > 0 ? cc->order : pageblock_order;
+
/*
* We are checking page_order without zone->lock taken. But
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (buddy_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= order)
return false;
}
@@ -1796,6 +1798,7 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
dst = list_entry(cc->freepages.next, struct folio, lru);
list_del(&dst->lru);
cc->nr_freepages--;
+ cc->nr_migratepages--;
return dst;
}
@@ -1811,6 +1814,7 @@ static void compaction_free(struct folio *dst, unsigned long data)
list_add(&dst->lru, &cc->freepages);
cc->nr_freepages++;
+ cc->nr_migratepages++;
}
/* possible outcome of isolate_migratepages */
@@ -2433,7 +2437,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
- unsigned int nr_succeeded = 0;
+ unsigned int nr_succeeded = 0, nr_migratepages;
/*
* These counters track activities during zone compaction. Initialize
@@ -2551,11 +2555,17 @@ rescan:
pageblock_start_pfn(cc->migrate_pfn - 1));
}
+ /*
+ * Record the number of pages to migrate, since
+ * compaction_alloc()/compaction_free() will update cc->nr_migratepages
+ * as pages are allocated and freed.
+ */
+ nr_migratepages = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION, &nr_succeeded);
- trace_mm_compaction_migratepages(cc, nr_succeeded);
+ trace_mm_compaction_migratepages(nr_migratepages, nr_succeeded);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
@@ -2788,25 +2798,27 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * Compact all zones within a node till each zone's fragmentation score
- * reaches within proactive compaction thresholds (as determined by the
- * proactiveness tunable).
+ * compact_node() - compact all zones within a node
+ * @pgdat: The node page data
+ * @proactive: Whether the compaction is proactive
*
- * It is possible that the function returns before reaching score targets
- * due to various back-off conditions, such as, contention on per-node or
- * per-zone locks.
+ * For proactive compaction, compact till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable). It is possible that the function returns before
+ * reaching the score targets due to various back-off conditions, such as
+ * contention on per-node or per-zone locks.
*/
-static void proactive_compact_node(pg_data_t *pgdat)
+static void compact_node(pg_data_t *pgdat, bool proactive)
{
int zoneid;
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .mode = MIGRATE_SYNC_LIGHT,
+ .mode = proactive ? MIGRATE_SYNC_LIGHT : MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
.gfp_mask = GFP_KERNEL,
- .proactive_compaction = true,
+ .proactive_compaction = proactive,
};
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
@@ -2818,41 +2830,16 @@ static void proactive_compact_node(pg_data_t *pgdat)
compact_zone(&cc, NULL);
- count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
- cc.total_migrate_scanned);
- count_compact_events(KCOMPACTD_FREE_SCANNED,
- cc.total_free_scanned);
- }
-}
-
-/* Compact all zones within a node */
-static void compact_node(int nid)
-{
- pg_data_t *pgdat = NODE_DATA(nid);
- int zoneid;
- struct zone *zone;
- struct compact_control cc = {
- .order = -1,
- .mode = MIGRATE_SYNC,
- .ignore_skip_hint = true,
- .whole_zone = true,
- .gfp_mask = GFP_KERNEL,
- };
-
-
- for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
-
- zone = &pgdat->node_zones[zoneid];
- if (!populated_zone(zone))
- continue;
-
- cc.zone = zone;
-
- compact_zone(&cc, NULL);
+ if (proactive) {
+ count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+ cc.total_migrate_scanned);
+ count_compact_events(KCOMPACTD_FREE_SCANNED,
+ cc.total_free_scanned);
+ }
}
}
-/* Compact all nodes in the system */
+/* Compact all zones of all nodes in the system */
static void compact_nodes(void)
{
int nid;
@@ -2861,7 +2848,7 @@ static void compact_nodes(void)
lru_add_drain_all();
for_each_online_node(nid)
- compact_node(nid);
+ compact_node(NODE_DATA(nid), false);
}
static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
@@ -2923,7 +2910,7 @@ static ssize_t compact_store(struct device *dev,
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
- compact_node(nid);
+ compact_node(NODE_DATA(nid), false);
}
return count;
@@ -3132,7 +3119,7 @@ static int kcompactd(void *p)
unsigned int prev_score, score;
prev_score = fragmentation_score_node(pgdat);
- proactive_compact_node(pgdat);
+ compact_node(pgdat, true);
score = fragmentation_score_node(pgdat);
/*
* Defer proactive compaction if the fragmentation
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 29f43fbc2eff..fecb8172410c 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -71,7 +71,7 @@ config DAMON_SYSFS_KUNIT_TEST
If unsure, say N.
-config DAMON_DBGFS
+config DAMON_DBGFS_DEPRECATED
bool "DAMON debugfs interface (DEPRECATED!)"
depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
help
@@ -84,6 +84,11 @@ config DAMON_DBGFS
(DAMON_SYSFS). If you depend on this and cannot move, please report
your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
+config DAMON_DBGFS
+ bool
+ default y
+ depends on DAMON_DBGFS_DEPRECATED
+
config DAMON_DBGFS_KUNIT_TEST
bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
depends on DAMON_DBGFS && KUNIT=y
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 7dac24e69e3b..2461cfe2e968 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -15,6 +15,11 @@
#include <linux/page_idle.h>
#include <linux/slab.h>
+#define DAMON_DBGFS_DEPRECATION_NOTICE \
+ "DAMON debugfs interface is deprecated, so users should move " \
+ "to DAMON_SYSFS. If you cannot, please report your usecase to " \
+ "damon@lists.linux.dev and linux-mm@kvack.org.\n"
+
static struct damon_ctx **dbgfs_ctxs;
static int dbgfs_nr_ctxs;
static struct dentry **dbgfs_dirs;
@@ -22,10 +27,7 @@ static DEFINE_MUTEX(damon_dbgfs_lock);
static void damon_dbgfs_warn_deprecation(void)
{
- pr_warn_once("DAMON debugfs interface is deprecated, "
- "so users should move to DAMON_SYSFS. If you cannot, "
- "please report your usecase to damon@lists.linux.dev and "
- "linux-mm@kvack.org.\n");
+ pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE);
}
/*
@@ -805,6 +807,14 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
damon_destroy_ctx(ctx);
}
+static ssize_t damon_dbgfs_deprecated_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
+
+ return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf));
+}
+
/*
* Make a context of @name and create a debugfs directory for it.
*
@@ -1056,6 +1066,10 @@ static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
return nonseekable_open(inode, file);
}
+static const struct file_operations deprecated_fops = {
+ .read = damon_dbgfs_deprecated_read,
+};
+
static const struct file_operations mk_contexts_fops = {
.open = damon_dbgfs_static_file_open,
.write = dbgfs_mk_context_write,
@@ -1076,9 +1090,9 @@ static int __init __damon_dbgfs_init(void)
{
struct dentry *dbgfs_root;
const char * const file_names[] = {"mk_contexts", "rm_contexts",
- "monitor_on"};
+ "monitor_on_DEPRECATED", "DEPRECATED"};
const struct file_operations *fops[] = {&mk_contexts_fops,
- &rm_contexts_fops, &monitor_on_fops};
+ &rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
int i;
dbgfs_root = debugfs_create_dir("damon", NULL);
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 4c37a166eb81..ec0703e1e90b 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx, bool total_bytes_only);
+void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx);
+
bool damos_sysfs_regions_upd_done(void);
int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index ae0f0b314f3a..f6c7f43f06cc 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -127,17 +127,17 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = {
*
* Once the tried regions update request is received, the request handling
* start function (damon_sysfs_scheme_update_regions_start()) sets the status
- * of all schemes as 'idle' again, and register ->before_damos_apply() and
- * ->after_sampling() callbacks.
+ * of all schemes as 'idle' again, and registers the ->before_damos_apply()
+ * callback.
*
* Then, the first followup ->before_damos_apply() callback
* (damon_sysfs_before_damos_apply()) sets the status 'started'. The first
- * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call
- * is called only after the scheme is completely applied
- * to the given snapshot. Hence the callback knows the situation by showing
- * 'started' status, and sets the status as 'finished'. Then,
- * damon_sysfs_before_damos_apply() understands the situation by showing the
- * 'finished' status and do nothing.
+ * ->after_sampling() or ->after_aggregation() callback
+ * (damon_sysfs_cmd_request_callback()) after that call is invoked only after
+ * the scheme is completely applied to the given snapshot. Hence the callback
+ * knows the situation by seeing the 'started' status, and sets the status to
+ * 'finished'. Then, damon_sysfs_before_damos_apply() recognizes the
+ * situation from the 'finished' status and does nothing.
*
* If DAMOS is not applied to any region due to any reasons including the
* access pattern, the watermarks, the quotas, and the filters,
@@ -2122,7 +2122,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
* callback is registered, damon_sysfs_lock should be held to ensure the
* regions directories exist.
*/
-static int damon_sysfs_after_sampling(struct damon_ctx *ctx)
+void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx)
{
struct damon_sysfs_schemes *sysfs_schemes =
damon_sysfs_schemes_for_damos_callback;
@@ -2138,8 +2138,6 @@ static int damon_sysfs_after_sampling(struct damon_ctx *ctx)
sysfs_regions->upd_status =
DAMOS_TRIED_REGIONS_UPD_FINISHED;
}
-
- return 0;
}
/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
@@ -2212,7 +2210,6 @@ int damon_sysfs_schemes_update_regions_start(
damos_tried_regions_init_upd_status(sysfs_schemes, ctx);
damos_regions_upd_total_bytes_only = total_bytes_only;
ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
- ctx->callback.after_sampling = damon_sysfs_after_sampling;
return 0;
}
@@ -2241,7 +2238,6 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
{
damon_sysfs_schemes_for_damos_callback = NULL;
ctx->callback.before_damos_apply = NULL;
- ctx->callback.after_sampling = NULL;
damon_sysfs_schemes_region_idx = 0;
return 0;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 1f891e18b4ee..678de97fcc88 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1379,11 +1379,13 @@ static int damon_sysfs_commit_schemes_quota_goals(
* damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
* @c: The DAMON context of the callback.
* @active: Whether @c is not deactivated due to watermarks.
+ * @after_aggregation: Whether this is called from the after_aggregation() callback.
*
* This function is periodically called back from the kdamond thread for @c.
* Then, it checks if there is a waiting DAMON sysfs request and handles it.
*/
-static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
+static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active,
+ bool after_aggregation)
{
struct damon_sysfs_kdamond *kdamond;
bool total_bytes_only = false;
@@ -1401,6 +1403,8 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
err = damon_sysfs_upd_schemes_stats(kdamond);
break;
case DAMON_SYSFS_CMD_COMMIT:
+ if (!after_aggregation)
+ goto out;
err = damon_sysfs_commit_input(kdamond);
break;
case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
@@ -1418,6 +1422,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
goto keep_lock_out;
}
} else {
+ damos_sysfs_mark_finished_regions_updates(c);
/*
* Continue regions updating if DAMON is still
* active and the update for all schemes is not
@@ -1450,7 +1455,16 @@ static int damon_sysfs_after_wmarks_check(struct damon_ctx *c)
* after_wmarks_check() is called back while the context is deactivated
* by watermarks.
*/
- return damon_sysfs_cmd_request_callback(c, false);
+ return damon_sysfs_cmd_request_callback(c, false, false);
+}
+
+static int damon_sysfs_after_sampling(struct damon_ctx *c)
+{
+ /*
+ * after_sampling() is called back only while the context is not
+ * deactivated by watermarks.
+ */
+ return damon_sysfs_cmd_request_callback(c, true, false);
}
static int damon_sysfs_after_aggregation(struct damon_ctx *c)
@@ -1459,7 +1473,7 @@ static int damon_sysfs_after_aggregation(struct damon_ctx *c)
* after_aggregation() is called back only while the context is not
* deactivated by watermarks.
*/
- return damon_sysfs_cmd_request_callback(c, true);
+ return damon_sysfs_cmd_request_callback(c, true, true);
}
static struct damon_ctx *damon_sysfs_build_ctx(
@@ -1478,6 +1492,7 @@ static struct damon_ctx *damon_sysfs_build_ctx(
}
ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check;
+ ctx->callback.after_sampling = damon_sysfs_after_sampling;
ctx->callback.after_aggregation = damon_sysfs_after_aggregation;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
diff --git a/mm/filemap.c b/mm/filemap.c
index 4a30de98a8c7..b7a21551fbc7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -843,7 +843,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, index);
- int huge = folio_test_hugetlb(folio);
+ bool huge = folio_test_hugetlb(folio);
bool charged = false;
long nr = 1;
@@ -1354,7 +1354,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
unsigned long pflags;
bool in_thrashing;
wait_queue_head_t *q;
- struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+ struct folio *folio = pfn_swap_entry_folio(entry);
q = folio_waitqueue(folio);
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
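
This hunk, like several below in huge_memory.c and memory.c, replaces page_folio(pfn_swap_entry_to_page(entry)) with the new pfn_swap_entry_folio() helper. A sketch of what such a helper looks like, modelled on the existing pfn_swap_entry_to_page() in include/linux/swapops.h (the in-tree definition is authoritative):

static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
{
	struct folio *folio = pfn_folio(swp_offset_pfn(entry));

	/*
	 * Any use of migration entries may only occur while the
	 * corresponding folio is locked.
	 */
	BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio));

	return folio;
}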
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94c958f7ebb5..28341a5067fb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1905,12 +1905,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
- struct page *page = NULL;
+ struct folio *folio = NULL;
int flush_needed = 1;
if (pmd_present(orig_pmd)) {
- page = pmd_page(orig_pmd);
- folio_remove_rmap_pmd(page_folio(page), page, vma);
+ struct page *page = pmd_page(orig_pmd);
+
+ folio = page_folio(page);
+ folio_remove_rmap_pmd(folio, page, vma);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
@@ -1918,23 +1920,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
- add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
+ add_mm_counter(tlb->mm, mm_counter_file(folio),
+ -HPAGE_PMD_NR);
}
spin_unlock(ptl);
if (flush_needed)
- tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
+ tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
}
return 1;
}
@@ -2045,7 +2048,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+ struct folio *folio = pfn_swap_entry_folio(entry);
pmd_t newpmd;
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
@@ -2155,7 +2158,7 @@ unlock:
#ifdef CONFIG_USERFAULTFD
/*
- * The PT lock for src_pmd and the mmap_lock for reading are held by
+ * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
* the caller, but it must return after releasing the page_table_lock.
* Just move the page from src_pmd to dst_pmd if possible.
* Return zero if succeeded in moving the page, -EAGAIN if it needs to be
@@ -2178,7 +2181,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
src_ptl = pmd_lockptr(mm, src_pmd);
lockdep_assert_held(src_ptl);
- mmap_assert_locked(mm);
+ vma_assert_locked(src_vma);
+ vma_assert_locked(dst_vma);
/* Sanity checks before the operation */
if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
@@ -2197,13 +2201,18 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
}
src_page = pmd_page(src_pmdval);
- if (unlikely(!PageAnonExclusive(src_page))) {
- spin_unlock(src_ptl);
- return -EBUSY;
- }
- src_folio = page_folio(src_page);
- folio_get(src_folio);
+ if (!is_huge_zero_pmd(src_pmdval)) {
+ if (unlikely(!PageAnonExclusive(src_page))) {
+ spin_unlock(src_ptl);
+ return -EBUSY;
+ }
+
+ src_folio = page_folio(src_page);
+ folio_get(src_folio);
+ } else
+ src_folio = NULL;
+
spin_unlock(src_ptl);
flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
@@ -2211,19 +2220,22 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
src_addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
- folio_lock(src_folio);
+ if (src_folio) {
+ folio_lock(src_folio);
- /*
- * split_huge_page walks the anon_vma chain without the page
- * lock. Serialize against it with the anon_vma lock, the page
- * lock is not enough.
- */
- src_anon_vma = folio_get_anon_vma(src_folio);
- if (!src_anon_vma) {
- err = -EAGAIN;
- goto unlock_folio;
- }
- anon_vma_lock_write(src_anon_vma);
+ /*
+ * split_huge_page walks the anon_vma chain without the page
+ * lock. Serialize against it with the anon_vma lock, the page
+ * lock is not enough.
+ */
+ src_anon_vma = folio_get_anon_vma(src_folio);
+ if (!src_anon_vma) {
+ err = -EAGAIN;
+ goto unlock_folio;
+ }
+ anon_vma_lock_write(src_anon_vma);
+ } else
+ src_anon_vma = NULL;
dst_ptl = pmd_lockptr(mm, dst_pmd);
double_pt_lock(src_ptl, dst_ptl);
@@ -2232,45 +2244,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
err = -EAGAIN;
goto unlock_ptls;
}
- if (folio_maybe_dma_pinned(src_folio) ||
- !PageAnonExclusive(&src_folio->page)) {
- err = -EBUSY;
- goto unlock_ptls;
- }
+ if (src_folio) {
+ if (folio_maybe_dma_pinned(src_folio) ||
+ !PageAnonExclusive(&src_folio->page)) {
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
- WARN_ON_ONCE(!folio_test_anon(src_folio))) {
- err = -EBUSY;
- goto unlock_ptls;
- }
+ if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
+ WARN_ON_ONCE(!folio_test_anon(src_folio))) {
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- folio_move_anon_rmap(src_folio, dst_vma);
- WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+ folio_move_anon_rmap(src_folio, dst_vma);
+ WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
- src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
- err = -EBUSY;
- goto unlock_ptls;
- }
+ src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+ _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+ /* Follow mremap() behavior and treat the entry dirty after the move */
+ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+ } else {
+ src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+ _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+ }
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
double_pt_unlock(src_ptl, dst_ptl);
- anon_vma_unlock_write(src_anon_vma);
- put_anon_vma(src_anon_vma);
+ if (src_anon_vma) {
+ anon_vma_unlock_write(src_anon_vma);
+ put_anon_vma(src_anon_vma);
+ }
unlock_folio:
/* unblock rmap walks */
- folio_unlock(src_folio);
+ if (src_folio)
+ folio_unlock(src_folio);
mmu_notifier_invalidate_range_end(&range);
- folio_put(src_folio);
+ if (src_folio)
+ folio_put(src_folio);
return err;
}
#endif /* CONFIG_USERFAULTFD */
@@ -2442,7 +2463,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
@@ -2453,7 +2474,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
folio_remove_rmap_pmd(folio, page, vma);
folio_put(folio);
}
- add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
+ add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
return;
}
@@ -2559,15 +2580,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte);
- for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry;
- /*
- * Note that NUMA hinting access restrictions are not
- * transferred to avoid any possibility of altering
- * permissions across VMAs.
- */
- if (freeze || pmd_migration) {
+
+ /*
+ * Note that NUMA hinting access restrictions are not transferred to
+ * avoid any possibility of altering permissions across VMAs.
+ */
+ if (freeze || pmd_migration) {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t entry;
swp_entry_t swp_entry;
+
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i));
@@ -2586,25 +2608,32 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
- } else {
- entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
- if (write)
- entry = pte_mkwrite(entry, vma);
- if (!young)
- entry = pte_mkold(entry);
- /* NOTE: this may set soft-dirty too on some archs */
- if (dirty)
- entry = pte_mkdirty(entry);
- if (soft_dirty)
- entry = pte_mksoft_dirty(entry);
- if (uffd_wp)
- entry = pte_mkuffd_wp(entry);
+
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
}
- VM_BUG_ON(!pte_none(ptep_get(pte)));
- set_pte_at(mm, addr, pte, entry);
- pte++;
+ } else {
+ pte_t entry;
+
+ entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
+ if (write)
+ entry = pte_mkwrite(entry, vma);
+ if (!young)
+ entry = pte_mkold(entry);
+ /* NOTE: this may set soft-dirty too on some archs */
+ if (dirty)
+ entry = pte_mkdirty(entry);
+ if (soft_dirty)
+ entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+
+ set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
}
- pte_unmap(pte - 1);
+ pte_unmap(pte);
if (!pmd_migration)
folio_remove_rmap_pmd(folio, page, vma);
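
The present (non-migration) branch above now builds a single PTE from the head page and hands it to set_ptes() for all HPAGE_PMD_NR entries; this works because set_ptes() advances the PFN for each subsequent entry. A rough sketch of the generic fallback's behaviour (the real definition lives in include/linux/pgtable.h and additionally handles page-table checking and lazy-MMU batching):

/* Illustrative sketch only, not the in-tree definition. */
static inline void set_ptes_sketch(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pte_t pte, unsigned int nr)
{
	for (;;) {
		set_pte_at(mm, addr, ptep, pte);
		if (--nr == 0)
			break;
		ptep++;
		addr += PAGE_SIZE;
		pte = pte_next_pfn(pte);	/* point at the next page's PFN */
	}
}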
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ed1581b670d4..c53a41d07cd3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3029,21 +3029,9 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = folio_nid(old_folio);
- struct folio *new_folio;
+ struct folio *new_folio = NULL;
int ret = 0;
- /*
- * Before dissolving the folio, we need to allocate a new one for the
- * pool to remain stable. Here, we allocate the folio and 'prep' it
- * by doing everything but actually updating counters and adding to
- * the pool. This simplifies and let us do most of the processing
- * under the lock.
- */
- new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
- if (!new_folio)
- return -ENOMEM;
- __prep_new_hugetlb_folio(h, new_folio);
-
retry:
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
@@ -3073,6 +3061,16 @@ retry:
cond_resched();
goto retry;
} else {
+ if (!new_folio) {
+ spin_unlock_irq(&hugetlb_lock);
+ new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
+ NULL, NULL);
+ if (!new_folio)
+ return -ENOMEM;
+ __prep_new_hugetlb_folio(h, new_folio);
+ goto retry;
+ }
+
/*
* Ok, old_folio is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
@@ -3100,9 +3098,11 @@ retry:
free_new:
spin_unlock_irq(&hugetlb_lock);
- /* Folio has a zero ref count, but needs a ref to be freed */
- folio_ref_unfreeze(new_folio, 1);
- update_and_free_hugetlb_folio(h, new_folio, false);
+ if (new_folio) {
+ /* Folio has a zero ref count, but needs a ref to be freed */
+ folio_ref_unfreeze(new_folio, 1);
+ update_and_free_hugetlb_folio(h, new_folio, false);
+ }
return ret;
}
@@ -5585,6 +5585,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ bool adjust_reservation = false;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5677,7 +5678,31 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
hugetlb_count_sub(pages_per_huge_page(h), mm);
hugetlb_remove_rmap(page_folio(page));
+ /*
+ * Restore the reservation for an anonymous page; otherwise the
+ * backing page could be stolen by someone.
+ * If we are freeing a surplus page, do not set the restore
+ * reservation bit.
+ */
+ if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
+ folio_test_anon(page_folio(page))) {
+ folio_set_hugetlb_restore_reserve(page_folio(page));
+ /* Reservation to be adjusted after the spin lock */
+ adjust_reservation = true;
+ }
+
spin_unlock(ptl);
+
+ /*
+ * Adjust the reservation for the region that will have the
+ * reserve restored. Keep in mind that vma_needs_reservation() changes
+ * resv->adds_in_progress if it succeeds. If this is not done,
+ * do_exit() will not see it, and will keep the reservation
+ * forever.
+ */
+ if (adjust_reservation && vma_needs_reservation(h, vma, address))
+ vma_add_reservation(h, vma, address);
+
tlb_remove_page_size(tlb, page, huge_page_size(h));
/*
* Bail out after unmapping reference page if supplied
@@ -7695,6 +7720,13 @@ void __init hugetlb_cma_reserve(int order)
bool node_specific_cma_alloc = false;
int nid;
+ /*
+ * HugeTLB CMA reservation is required for gigantic
+ * huge pages which cannot be allocated via the
+ * page allocator. Just warn if there is any change
+ * breaking this assumption.
+ */
+ VM_WARN_ON(order <= MAX_PAGE_ORDER);
cma_reserve_called = true;
if (!hugetlb_cma_size)
diff --git a/mm/internal.h b/mm/internal.h
index f309a010d50f..1e29c5821a1d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1114,6 +1114,15 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
extern bool mirrored_kernelcore;
extern bool memblock_has_mirror(void);
+static __always_inline void vma_set_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff)
+{
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+}
+
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
/*
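
vma_set_range() consolidates the vm_start/vm_end/vm_pgoff triple assignment that several VMA setup paths open-code. A hypothetical caller, only to show the intended shape:

/* Hypothetical helper, for illustration only. */
static void example_place_vma(struct vm_area_struct *vma, unsigned long addr,
			      unsigned long len, pgoff_t pgoff)
{
	/* Replaces three open-coded field assignments with one call. */
	vma_set_range(vma, addr, addr + len, pgoff);
}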
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6ca63e8dda74..e7c9a4dc89f8 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -55,7 +55,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack)
u64 ts_nsec = local_clock();
track->cpu = cpu;
- track->timestamp = ts_nsec >> 3;
+ track->timestamp = ts_nsec >> 9;
#endif /* CONFIG_KASAN_EXTRA_INFO */
track->pid = current->pid;
track->stack = stack;
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 971cfff4ca0b..2d8ae4fbe63b 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -440,7 +440,8 @@ static void kmalloc_oob_16(struct kunit *test)
/* This test is specifically crafted for the generic mode. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
- ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+ /* RELOC_HIDE to prevent gcc from warning about short alloc */
+ ptr1 = RELOC_HIDE(kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL), 0);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
@@ -697,6 +698,84 @@ static void kmalloc_uaf3(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]);
}
+static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
+{
+ int *i_unsafe = (int *)unsafe;
+
+ KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_and(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_andnot(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_or(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_xor(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_and(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_andnot(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_or(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xor(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe));
+}
+
+static void kasan_atomics(struct kunit *test)
+{
+ void *a1, *a2;
+
+ /*
+ * Just as with kasan_bitops_tags(), we allocate 48 bytes of memory such
+ * that the following 16 bytes will make up the redzone.
+ */
+ a1 = kzalloc(48, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1);
+ a2 = kzalloc(sizeof(atomic_long_t), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a2);
+
+ /* Use atomics to access the redzone. */
+ kasan_atomics_helper(test, a1 + 48, a2);
+
+ kfree(a1);
+ kfree(a2);
+}
+
static void kmalloc_double_kzfree(struct kunit *test)
{
char *ptr;
@@ -1883,6 +1962,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_strings),
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
+ KUNIT_CASE(kasan_atomics),
KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
index 8b7b3ea2c74e..27ec22767e42 100644
--- a/mm/kasan/kasan_test_module.c
+++ b/mm/kasan/kasan_test_module.c
@@ -62,7 +62,7 @@ static noinline void __init copy_user_test(void)
kfree(kmem);
}
-static int __init test_kasan_module_init(void)
+static int __init kasan_test_module_init(void)
{
/*
* Temporarily enable multi-shot mode. Otherwise, KASAN would only
@@ -77,5 +77,5 @@ static int __init test_kasan_module_init(void)
return -EAGAIN;
}
-module_init(test_kasan_module_init);
+module_init(kasan_test_module_init);
MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 7afa4feb03e1..b48c768acc84 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -267,7 +267,7 @@ static void print_track(struct kasan_track *track, const char *prefix)
u64 ts_nsec = track->timestamp;
unsigned long rem_usec;
- ts_nsec <<= 3;
+ ts_nsec <<= 9;
rem_usec = do_div(ts_nsec, NSEC_PER_SEC) / 1000;
pr_err("%s by task %u on cpu %d at %lu.%06lus:\n",
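
The shift in kasan_set_track() grows from 3 to 9 bits and print_track() above reverses it with the matching left shift, so timestamps are stored at a granularity of 2^9 = 512 ns (roughly half a microsecond) and reconstructed approximately at report time. A worked example with an assumed clock reading:

/*
 * Assumed reading: ts_nsec = 1,000,000,123 ns
 *   stored   = ts_nsec >> 9 = 1,953,125      (1 unit = 512 ns)
 *   restored = stored << 9  = 1,000,000,000  (error bounded by 511 ns)
 */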
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2b219acb528e..fe43fbc44525 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1634,7 +1634,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 3: set proper refcount and mm_counters. */
if (nr_ptes) {
folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
}
/* step 4: remove empty page table */
@@ -1665,7 +1665,7 @@ abort:
if (nr_ptes) {
flush_tlb_mm(mm);
folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
}
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 5d6e2dee5692..0b09daa188ef 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -359,6 +359,12 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
}
/* Functions from kmsan-checks.h follow. */
+
+/*
+ * To create an origin, kmsan_poison_memory() unwinds the stack and stores it
+ * in the stack depot. This may cause deadlocks if done from within the KMSAN
+ * runtime, so we bail out if kmsan_in_runtime().
+ */
void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
{
if (!kmsan_enabled || kmsan_in_runtime())
@@ -371,47 +377,31 @@ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
}
EXPORT_SYMBOL(kmsan_poison_memory);
+/*
+ * Unlike kmsan_poison_memory(), this function can be used from within KMSAN
+ * runtime, because it does not trigger allocations or call instrumented code.
+ */
void kmsan_unpoison_memory(const void *address, size_t size)
{
unsigned long ua_flags;
- if (!kmsan_enabled || kmsan_in_runtime())
+ if (!kmsan_enabled)
return;
ua_flags = user_access_save();
- kmsan_enter_runtime();
/* The users may want to poison/unpoison random memory. */
kmsan_internal_unpoison_memory((void *)address, size,
KMSAN_POISON_NOCHECK);
- kmsan_leave_runtime();
user_access_restore(ua_flags);
}
EXPORT_SYMBOL(kmsan_unpoison_memory);
/*
- * Version of kmsan_unpoison_memory() that can be called from within the KMSAN
- * runtime.
- *
- * Non-instrumented IRQ entry functions receive struct pt_regs from assembly
- * code. Those regs need to be unpoisoned, otherwise using them will result in
- * false positives.
- * Using kmsan_unpoison_memory() is not an option in entry code, because the
- * return value of in_task() is inconsistent - as a result, certain calls to
- * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that
- * the registers are unpoisoned even if kmsan_in_runtime() is true in the early
- * entry code.
+ * Version of kmsan_unpoison_memory() called from IRQ entry functions.
*/
void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
{
- unsigned long ua_flags;
-
- if (!kmsan_enabled)
- return;
-
- ua_flags = user_access_save();
- kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs),
- KMSAN_POISON_NOCHECK);
- user_access_restore(ua_flags);
+ kmsan_unpoison_memory((void *)regs, sizeof(*regs));
}
void kmsan_check_memory(const void *addr, size_t size)
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 35b0147542a9..3fd64736bc45 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -162,20 +162,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);
-void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg)
-{
- struct list_lru_one *list =
- list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
-
- if (list_empty(item)) {
- list_add_tail(item, &list->list);
- if (!list->nr_items++)
- set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
- }
-}
-EXPORT_SYMBOL_GPL(list_lru_putback);
-
unsigned long list_lru_count_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg)
{
@@ -257,6 +243,9 @@ restart:
*/
assert_spin_locked(&nlru->lock);
goto restart;
+ case LRU_STOP:
+ assert_spin_locked(&nlru->lock);
+ goto out;
default:
BUG();
}
@@ -567,6 +556,9 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
lru->shrinker_id = shrinker->id;
else
lru->shrinker_id = -1;
+
+ if (mem_cgroup_kmem_disabled())
+ memcg_aware = false;
#endif
lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 61932c9215e7..cb216d30a221 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4800,7 +4800,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- mem_cgroup_flush_stats(memcg);
+ mem_cgroup_flush_stats_ratelimited(memcg);
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5621,7 +5621,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
- if (unlikely(mem_cgroup_is_root(memcg)))
+ if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
FLUSH_TIME);
lru_gen_online_memcg(memcg);
@@ -5873,7 +5873,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
}
union mc_target {
- struct page *page;
+ struct folio *folio;
swp_entry_t ent;
};
@@ -5965,23 +5965,22 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
}
/**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
+ * mem_cgroup_move_account - move account of the folio
+ * @folio: The folio.
* @compound: charge the page as compound or small page
- * @from: mem_cgroup which the page is moved from.
- * @to: mem_cgroup which the page is moved to. @from != @to.
+ * @from: mem_cgroup which the folio is moved from.
+ * @to: mem_cgroup which the folio is moved to. @from != @to.
*
- * The page must be locked and not on the LRU.
+ * The folio must be locked and not on the LRU.
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*/
-static int mem_cgroup_move_account(struct page *page,
+static int mem_cgroup_move_account(struct folio *folio,
bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
- struct folio *folio = page_folio(page);
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
@@ -6096,7 +6095,7 @@ out:
* Return:
* * MC_TARGET_NONE - If the pte is not a target for move charge.
* * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
- * move charge. If @target is not NULL, the page is stored in target->page
+ * move charge. If @target is not NULL, the folio is stored in target->folio
* with extra refcnt taken (Caller should release it).
* * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
* target for charge migration. If @target is not NULL, the entry is
@@ -6110,6 +6109,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
+ struct folio *folio;
enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
@@ -6124,9 +6124,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
+ if (page)
+ folio = page_folio(page);
if (target && page) {
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return ret;
}
/*
@@ -6141,8 +6143,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* Alas, skip moving the page in this case.
*/
if (!pte_present(ptent) && page_mapped(page)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
}
@@ -6155,18 +6157,18 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (page_memcg(page) == mc.from) {
+ if (folio_memcg(folio) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page) ||
- is_device_coherent_page(page))
+ if (folio_is_device_private(folio) ||
+ folio_is_device_coherent(folio))
ret = MC_TARGET_DEVICE;
if (target)
- target->page = page;
+ target->folio = folio;
}
if (!ret || !target) {
if (target)
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
}
/*
@@ -6192,6 +6194,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, union mc_target *target)
{
struct page *page = NULL;
+ struct folio *folio;
enum mc_target_type ret = MC_TARGET_NONE;
if (unlikely(is_swap_pmd(pmd))) {
@@ -6201,17 +6204,18 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
}
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
+ folio = page_folio(page);
if (!(mc.flags & MOVE_ANON))
return ret;
- if (page_memcg(page) == mc.from) {
+ if (folio_memcg(folio) == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return MC_TARGET_NONE;
}
- target->page = page;
+ target->folio = folio;
}
}
return ret;
@@ -6431,7 +6435,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
spinlock_t *ptl;
enum mc_target_type target_type;
union mc_target target;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -6441,26 +6445,26 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
- page = target.page;
- if (isolate_lru_page(page)) {
- if (!mem_cgroup_move_account(page, true,
+ folio = target.folio;
+ if (folio_isolate_lru(folio)) {
+ if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
- putback_lru_page(page);
+ folio_putback_lru(folio);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
} else if (target_type == MC_TARGET_DEVICE) {
- page = target.page;
- if (!mem_cgroup_move_account(page, true,
+ folio = target.folio;
+ if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
spin_unlock(ptl);
return 0;
@@ -6483,28 +6487,28 @@ retry:
device = true;
fallthrough;
case MC_TARGET_PAGE:
- page = target.page;
+ folio = target.folio;
/*
* We can have a part of the split pmd here. Moving it
* can be done but it would be too convoluted so simply
* ignore such a partial THP and keep it in original
* memcg. There should be somebody mapping the head.
*/
- if (PageTransCompound(page))
+ if (folio_test_large(folio))
goto put;
- if (!device && !isolate_lru_page(page))
+ if (!device && !folio_isolate_lru(folio))
goto put;
- if (!mem_cgroup_move_account(page, false,
+ if (!mem_cgroup_move_account(folio, false,
mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
if (!device)
- putback_lru_page(page);
+ folio_putback_lru(folio);
put: /* get_mctgt_type() gets & locks the page */
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
case MC_TARGET_SWAP:
ent = target.ent;
@@ -6977,6 +6981,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
+ /* Will converge on zero, but reclaim enforces a minimum */
+ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
unsigned long reclaimed;
if (signal_pending(current))
@@ -6991,8 +6997,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
lru_add_drain_all();
reclaimed = try_to_free_mem_cgroup_pages(memcg,
- min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
- GFP_KERNEL, reclaim_options);
+ batch_size, GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
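
With the hunk above, each pass of the memory.reclaim loop asks for a quarter of what is still outstanding instead of a fixed SWAP_CLUSTER_MAX, and reclaim itself enforces the lower bound. A worked example, assuming a 262144-page request (1 GiB with 4 KiB pages) and that every call reclaims its full batch:

/*
 * remaining:   262144 -> 196608 -> 147456 -> 110592 -> ...
 * batch_size:   65536    49152    36864    27648        (remaining / 4)
 *
 * Once remaining / 4 falls below SWAP_CLUSTER_MAX,
 * try_to_free_mem_cgroup_pages() still works in at least
 * SWAP_CLUSTER_MAX-page steps, so the loop keeps converging.
 */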
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 5462d9e3c84c..0537664620e5 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -39,7 +39,7 @@ static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
-static struct bus_type memory_tier_subsys = {
+static const struct bus_type memory_tier_subsys = {
.name = "memory_tiering",
.dev_name = "memory_tier",
};
@@ -359,6 +359,26 @@ static void disable_all_demotion_targets(void)
synchronize_rcu();
}
+static void dump_demotion_targets(void)
+{
+ int node;
+
+ for_each_node_state(node, N_MEMORY) {
+ struct memory_tier *memtier = __node_get_memory_tier(node);
+ nodemask_t preferred = node_demotion[node].preferred;
+
+ if (!memtier)
+ continue;
+
+ if (nodes_empty(preferred))
+ pr_info("Demotion targets for Node %d: null\n", node);
+ else
+ pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
+ node, nodemask_pr_args(&preferred),
+ nodemask_pr_args(&memtier->lower_tier_mask));
+ }
+}
+
/*
* Find an automatic demotion target for all memory
* nodes. Failing here is OK. It might just indicate
@@ -443,7 +463,7 @@ static void establish_demotion_targets(void)
* Now build the lower_tier mask for each node collecting node mask from
* all memory tier below it. This allows us to fallback demotion page
* allocation to a set of nodes that is closer the above selected
- * perferred node.
+ * preferred node.
*/
lower_tier = node_states[N_MEMORY];
list_for_each_entry(memtier, &memory_tiers, list) {
@@ -456,6 +476,8 @@ static void establish_demotion_targets(void)
nodes_andnot(lower_tier, lower_tier, tier_nodes);
memtier->lower_tier_mask = lower_tier;
}
+
+ dump_demotion_targets();
}
#else
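
dump_demotion_targets() emits one line per N_MEMORY node using the format strings above. For a hypothetical machine with two DRAM nodes (0-1) and two CXL memory nodes (2-3) in a lower tier, the output would look roughly like:

  Demotion targets for Node 0: preferred: 2, fallback: 2-3
  Demotion targets for Node 1: preferred: 3, fallback: 2-3
  Demotion targets for Node 2: null
  Demotion targets for Node 3: null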
diff --git a/mm/memory.c b/mm/memory.c
index 0bfc8b007c01..642b4f2be523 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
- rss[mm_counter(page)]++;
+ rss[mm_counter(folio)]++;
if (!is_readable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
@@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* keep things as they are.
*/
folio_get(folio);
- rss[mm_counter(page)]++;
+ rss[mm_counter(folio)]++;
/* Cannot fail as these pages cannot get pinned. */
folio_try_dup_anon_rmap_pte(folio, page, src_vma);
@@ -930,68 +930,187 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
return 0;
}
+static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
+ pte_t pte, unsigned long addr, int nr)
+{
+ struct mm_struct *src_mm = src_vma->vm_mm;
+
+ /* If it's a COW mapping, write-protect it in both processes. */
+ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
+ wrprotect_ptes(src_mm, addr, src_pte, nr);
+ pte = pte_wrprotect(pte);
+ }
+
+ /* If it's a shared mapping, mark it clean in the child. */
+ if (src_vma->vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ if (!userfaultfd_wp(dst_vma))
+ pte = pte_clear_uffd_wp(pte);
+
+ set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
+}
+
+/* Flags for folio_pte_batch(). */
+typedef int __bitwise fpb_t;
+
+/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
+#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
+
+/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
+#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
+
+static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
+{
+ if (flags & FPB_IGNORE_DIRTY)
+ pte = pte_mkclean(pte);
+ if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
+ pte = pte_clear_soft_dirty(pte);
+ return pte_wrprotect(pte_mkold(pte));
+}
+
+/*
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same folio.
+ *
+ * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
+ * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
+ * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
+ *
+ * If "any_writable" is set, it will indicate if any other PTE besides the
+ * first (given) PTE is writable.
+ */
+static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
+ pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
+ bool *any_writable)
+{
+ unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
+ const pte_t *end_ptep = start_ptep + max_nr;
+ pte_t expected_pte, *ptep;
+ bool writable;
+ int nr;
+
+ if (any_writable)
+ *any_writable = false;
+
+ VM_WARN_ON_FOLIO(!pte_present(pte), folio);
+
+ nr = pte_batch_hint(start_ptep, pte);
+ expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
+ ptep = start_ptep + nr;
+
+ while (ptep < end_ptep) {
+ pte = ptep_get(ptep);
+ if (any_writable)
+ writable = !!pte_write(pte);
+ pte = __pte_batch_clear_ignored(pte, flags);
+
+ if (!pte_same(pte, expected_pte))
+ break;
+
+ /*
+ * Stop immediately once we reached the end of the folio. In
+ * corner cases the next PFN might fall into a different
+ * folio.
+ */
+ if (pte_pfn(pte) >= folio_end_pfn)
+ break;
+
+ if (any_writable)
+ *any_writable |= writable;
+
+ nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, nr);
+ ptep += nr;
+ }
+
+ return min(ptep - start_ptep, max_nr);
+}
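
As an aside, a minimal userspace model of the batch rule implemented by folio_pte_batch() above: walk forward while each entry maps the next expected PFN and stays inside the folio, optionally recording whether any entry after the first is writable. A toy PTE of a PFN plus a write bit stands in for pte_t and the FPB_IGNORE_* handling:

#include <stdbool.h>
#include <stdio.h>

/* Toy PTE: just a PFN and a writable bit (dirty/soft-dirty omitted, i.e.
 * both FPB_IGNORE_* flags are assumed set). */
struct toy_pte {
	unsigned long pfn;
	bool write;
};

/*
 * Count how many consecutive entries starting at ptep map consecutive PFNs
 * that still lie inside [folio_pfn, folio_end_pfn), mirroring the loop in
 * folio_pte_batch(). Optionally reports whether any later entry is writable.
 */
static int toy_pte_batch(const struct toy_pte *ptep, int max_nr,
			 unsigned long folio_end_pfn, bool *any_writable)
{
	unsigned long expected_pfn = ptep[0].pfn + 1;
	int nr = 1;

	if (any_writable)
		*any_writable = false;

	while (nr < max_nr) {
		if (ptep[nr].pfn != expected_pfn ||
		    ptep[nr].pfn >= folio_end_pfn)
			break;
		if (any_writable)
			*any_writable |= ptep[nr].write;
		expected_pfn++;
		nr++;
	}
	return nr;
}

int main(void)
{
	struct toy_pte ptes[] = {
		{ 100, false }, { 101, true }, { 102, false }, { 200, true },
	};
	bool any_writable;
	int nr = toy_pte_batch(ptes, 4, 104, &any_writable);

	/* Prints "batch of 3, any_writable=1": pfn 200 breaks the run. */
	printf("batch of %d, any_writable=%d\n", nr, any_writable);
	return 0;
}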
+
/*
- * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
- * is required to copy this pte.
+ * Copy one present PTE, trying to batch-process subsequent PTEs that map
+ * consecutive pages of the same folio by copying them as well.
+ *
+ * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
+ * Otherwise, returns the number of copied PTEs (at least 1).
*/
static inline int
-copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
- pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct folio **prealloc)
+copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
+ int max_nr, int *rss, struct folio **prealloc)
{
- struct mm_struct *src_mm = src_vma->vm_mm;
- unsigned long vm_flags = src_vma->vm_flags;
- pte_t pte = ptep_get(src_pte);
struct page *page;
struct folio *folio;
+ bool any_writable;
+ fpb_t flags = 0;
+ int err, nr;
page = vm_normal_page(src_vma, addr, pte);
- if (page)
- folio = page_folio(page);
- if (page && folio_test_anon(folio)) {
+ if (unlikely(!page))
+ goto copy_pte;
+
+ folio = page_folio(page);
+
+ /*
+ * If we likely have to copy, just don't bother with batching. Make
+ * sure that the common "small folio" case is as fast as possible
+ * by keeping the batching logic separate.
+ */
+ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
+ if (src_vma->vm_flags & VM_SHARED)
+ flags |= FPB_IGNORE_DIRTY;
+ if (!vma_soft_dirty_enabled(src_vma))
+ flags |= FPB_IGNORE_SOFT_DIRTY;
+
+ nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
+ &any_writable);
+ folio_ref_add(folio, nr);
+ if (folio_test_anon(folio)) {
+ if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+ nr, src_vma))) {
+ folio_ref_sub(folio, nr);
+ return -EAGAIN;
+ }
+ rss[MM_ANONPAGES] += nr;
+ VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+ } else {
+ folio_dup_file_rmap_ptes(folio, page, nr);
+ rss[mm_counter_file(folio)] += nr;
+ }
+ if (any_writable)
+ pte = pte_mkwrite(pte, src_vma);
+ __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
+ addr, nr);
+ return nr;
+ }
+
+ folio_get(folio);
+ if (folio_test_anon(folio)) {
/*
* If this page may have been pinned by the parent process,
* copy the page immediately for the child so that we'll always
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- folio_get(folio);
if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
/* Page may be pinned, we have to copy. */
folio_put(folio);
- return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, prealloc, page);
+ err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, page);
+ return err ? err : 1;
}
rss[MM_ANONPAGES]++;
- } else if (page) {
- folio_get(folio);
+ VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+ } else {
folio_dup_file_rmap_pte(folio, page);
- rss[mm_counter_file(page)]++;
+ rss[mm_counter_file(folio)]++;
}
- /*
- * If it's a COW mapping, write protect it both
- * in the parent and the child
- */
- if (is_cow_mapping(vm_flags) && pte_write(pte)) {
- ptep_set_wrprotect(src_mm, addr, src_pte);
- pte = pte_wrprotect(pte);
- }
- VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
-
- /*
- * If it's a shared mapping, mark it clean in
- * the child
- */
- if (vm_flags & VM_SHARED)
- pte = pte_mkclean(pte);
- pte = pte_mkold(pte);
-
- if (!userfaultfd_wp(dst_vma))
- pte = pte_clear_uffd_wp(pte);
-
- set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
- return 0;
+copy_pte:
+ __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
+ return 1;
}
static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
@@ -1028,10 +1147,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *src_pte, *dst_pte;
pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
- int progress, ret = 0;
+ int progress, max_nr, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
struct folio *prealloc = NULL;
+ int nr;
again:
progress = 0;
@@ -1062,6 +1182,8 @@ again:
arch_enter_lazy_mmu_mode();
do {
+ nr = 1;
+
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
@@ -1091,6 +1213,8 @@ again:
progress += 8;
continue;
}
+ ptent = ptep_get(src_pte);
+ VM_WARN_ON_ONCE(!pte_present(ptent));
/*
* Device exclusive entry restored, continue by copying
@@ -1098,9 +1222,10 @@ again:
*/
WARN_ON_ONCE(ret != -ENOENT);
}
- /* copy_present_pte() will clear `*prealloc' if consumed */
- ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, &prealloc);
+ /* copy_present_ptes() will clear `*prealloc' if consumed */
+ max_nr = (end - addr) / PAGE_SIZE;
+ ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
+ ptent, addr, max_nr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
@@ -1117,8 +1242,10 @@ again:
folio_put(prealloc);
prealloc = NULL;
}
- progress += 8;
- } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ nr = ret;
+ progress += 8 * nr;
+ } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
+ addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_src_pte, src_ptl);
@@ -1139,7 +1266,7 @@ again:
prealloc = folio_prealloc(src_mm, src_vma, addr, false);
if (!prealloc)
return -ENOMEM;
- } else if (ret) {
+ } else if (ret < 0) {
VM_WARN_ON_ONCE(1);
}
@@ -1369,19 +1496,16 @@ static inline bool should_zap_cows(struct zap_details *details)
return details->even_cows;
}
-/* Decides whether we should zap this page with the page pointer specified */
-static inline bool should_zap_page(struct zap_details *details, struct page *page)
+/* Decides whether we should zap this folio with the folio pointer specified */
+static inline bool should_zap_folio(struct zap_details *details,
+ struct folio *folio)
{
- /* If we can make a decision without *page.. */
+ /* If we can make a decision without *folio.. */
if (should_zap_cows(details))
return true;
- /* E.g. the caller passes NULL for the case of a zero page */
- if (!page)
- return true;
-
- /* Otherwise we should only zap non-anon pages */
- return !PageAnon(page);
+ /* Otherwise we should only zap non-anon folios */
+ return !folio_test_anon(folio);
}
static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
@@ -1398,7 +1522,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
*/
static inline void
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte,
+ unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
/* Zap on anonymous always means dropping everything */
@@ -1408,7 +1532,111 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
if (zap_drop_file_uffd_wp(details))
return;
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ for (;;) {
+ /* the PFN in the PTE is irrelevant. */
+ pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (--nr == 0)
+ break;
+ pte++;
+ addr += PAGE_SIZE;
+ }
+}
+
+static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, struct folio *folio,
+ struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
+ unsigned long addr, struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break)
+{
+ struct mm_struct *mm = tlb->mm;
+ bool delay_rmap = false;
+
+ if (!folio_test_anon(folio)) {
+ ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ if (pte_dirty(ptent)) {
+ folio_mark_dirty(folio);
+ if (tlb_delay_rmap(tlb)) {
+ delay_rmap = true;
+ *force_flush = true;
+ }
+ }
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
+ rss[mm_counter(folio)] -= nr;
+ } else {
+ /* We don't need up-to-date accessed/dirty bits. */
+ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ rss[MM_ANONPAGES] -= nr;
+ }
+ /* Checking a single PTE in a batch is sufficient. */
+ arch_check_zapped_pte(vma, ptent);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
+ if (unlikely(userfaultfd_pte_wp(vma, ptent)))
+ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
+ ptent);
+
+ if (!delay_rmap) {
+ folio_remove_rmap_ptes(folio, page, nr, vma);
+
+ /* Only sanity-check the first page in a batch. */
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ }
+ if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
+ *force_flush = true;
+ *force_break = true;
+ }
+}
+
+/*
+ * Zap or skip at least one present PTE, trying to batch-process subsequent
+ * PTEs that map consecutive pages of the same folio.
+ *
+ * Returns the number of processed (skipped or zapped) PTEs (at least 1).
+ */
+static inline int zap_present_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *force_flush,
+ bool *force_break)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ struct mm_struct *mm = tlb->mm;
+ struct folio *folio;
+ struct page *page;
+ int nr;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page) {
+ /* We don't need up-to-date accessed/dirty bits. */
+ ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+ arch_check_zapped_pte(vma, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ VM_WARN_ON_ONCE(userfaultfd_wp(vma));
+ ksm_might_unmap_zero_page(mm, ptent);
+ return 1;
+ }
+
+ folio = page_folio(page);
+ if (unlikely(!should_zap_folio(details, folio)))
+ return 1;
+
+ /*
+ * Make sure that the common "small folio" case is as fast as possible
+ * by keeping the batching logic separate.
+ */
+ if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+ NULL);
+
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+ addr, details, rss, force_flush,
+ force_break);
+ return nr;
+ }
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
+ details, rss, force_flush, force_break);
+ return 1;
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1416,13 +1644,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
+ bool force_flush = false, force_break = false;
struct mm_struct *mm = tlb->mm;
- int force_flush = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
+ int nr;
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
@@ -1436,7 +1665,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t ptent = ptep_get(pte);
struct folio *folio;
struct page *page;
+ int max_nr;
+ nr = 1;
if (pte_none(ptent))
continue;
@@ -1444,44 +1675,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
break;
if (pte_present(ptent)) {
- unsigned int delay_rmap;
-
- page = vm_normal_page(vma, addr, ptent);
- if (unlikely(!should_zap_page(details, page)))
- continue;
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- arch_check_zapped_pte(vma, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details,
- ptent);
- if (unlikely(!page)) {
- ksm_might_unmap_zero_page(mm, ptent);
- continue;
- }
-
- folio = page_folio(page);
- delay_rmap = 0;
- if (!folio_test_anon(folio)) {
- if (pte_dirty(ptent)) {
- folio_mark_dirty(folio);
- if (tlb_delay_rmap(tlb)) {
- delay_rmap = 1;
- force_flush = 1;
- }
- }
- if (pte_young(ptent) && likely(vma_has_recency(vma)))
- folio_mark_accessed(folio);
- }
- rss[mm_counter(page)]--;
- if (!delay_rmap) {
- folio_remove_rmap_pte(folio, page, vma);
- if (unlikely(page_mapcount(page) < 0))
- print_bad_pte(vma, addr, ptent, page);
- }
- if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
- force_flush = 1;
- addr += PAGE_SIZE;
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
+ addr, details, rss, &force_flush,
+ &force_break);
+ if (unlikely(force_break)) {
+ addr += nr * PAGE_SIZE;
break;
}
continue;
@@ -1492,7 +1691,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
is_device_exclusive_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
folio = page_folio(page);
- if (unlikely(!should_zap_page(details, page)))
+ if (unlikely(!should_zap_folio(details, folio)))
continue;
/*
* Both device private/exclusive mappings should only
@@ -1501,7 +1700,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
* see zap_install_uffd_wp_if_needed().
*/
WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(page)]--;
+ rss[mm_counter(folio)]--;
if (is_device_private_entry(entry))
folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
@@ -1513,10 +1712,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
} else if (is_migration_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- if (!should_zap_page(details, page))
+ folio = pfn_swap_entry_folio(entry);
+ if (!should_zap_folio(details, folio))
continue;
- rss[mm_counter(page)]--;
+ rss[mm_counter(folio)]--;
} else if (pte_marker_entry_uffd_wp(entry)) {
/*
* For anon: always drop the marker; for file: only
@@ -1535,8 +1734,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
WARN_ON_ONCE(1);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+ } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1870,7 +2069,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
return -EBUSY;
/* Ok, finally just insert the thing.. */
folio_get(folio);
- inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
folio_add_file_rmap_pte(folio, page, vma);
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
@@ -3175,7 +3374,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
if (old_folio) {
if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(&old_folio->page));
+ dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
@@ -4170,8 +4369,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages)
static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct vm_area_struct *vma = vmf->vma;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
unsigned long orders;
struct folio *folio;
unsigned long addr;
@@ -4223,15 +4422,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
folio = vma_alloc_folio(gfp, order, vma, addr, true);
if (folio) {
+ if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ folio_put(folio);
+ goto next;
+ }
+ folio_throttle_swaprate(folio, gfp);
clear_huge_page(&folio->page, vmf->address, 1 << order);
return folio;
}
+next:
order = next_order(&orders, order);
}
fallback:
#endif
- return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+ return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
}
/*
@@ -4298,10 +4503,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
nr_pages = folio_nr_pages(folio);
addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
- if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
- goto oom_free_page;
- folio_throttle_swaprate(folio, GFP_KERNEL);
-
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
@@ -4355,8 +4556,6 @@ unlock:
release:
folio_put(folio);
goto unlock;
-oom_free_page:
- folio_put(folio);
oom:
return VM_FAULT_OOM;
}
@@ -4480,7 +4679,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
folio_add_file_rmap_pmd(folio, page, vma);
/*
@@ -4543,7 +4742,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
folio_add_new_anon_rmap(folio, vma, addr);
folio_add_lru_vma(folio, vma);
} else {
- add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
folio_add_file_rmap_ptes(folio, page, nr, vma);
}
set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -6163,7 +6362,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg)
{
struct page *page = arg;
- clear_user_highpage(page + idx, addr);
+ clear_user_highpage(nth_page(page, idx), addr);
return 0;
}
@@ -6213,10 +6412,11 @@ struct copy_subpage_arg {
static int copy_subpage(unsigned long addr, int idx, void *arg)
{
struct copy_subpage_arg *copy_arg = arg;
+ struct page *dst = nth_page(copy_arg->dst, idx);
+ struct page *src = nth_page(copy_arg->src, idx);
- if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
- addr, copy_arg->vma)) {
- memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+ if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) {
+ memory_failure_queue(page_to_pfn(src), 0);
return -EHWPOISON;
}
return 0;
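
Taken together, the fork and zap changes in this file batch over PTE-mapped large folios. A minimal userspace exercise of both paths, assuming mTHP is enabled for anonymous memory (e.g. /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled set to "always") so the mapping below is backed by large folios:

#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * Touch a 16 MiB anonymous mapping, fork, and tear it down. With mTHP
 * enabled, the fork copy and the final munmap both go through the batched
 * copy_present_ptes()/zap_present_ptes() paths for large folios.
 */
int main(void)
{
	size_t len = 16UL << 20;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, len);		/* populate the PTEs */

	pid_t pid = fork();		/* copy_pte_range() on the parent's mm */
	if (pid == 0)
		_exit(0);		/* child exit zaps its copy */
	waitpid(pid, NULL, 0);

	munmap(p, len);			/* zap_pte_range() in the parent */
	return 0;
}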
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 21890994c1d3..a444e2d7dd2b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone)
+ struct zone *zone, bool mhp_off_inaccessible)
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
@@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (ret)
return ret;
+ /*
+ * Memory block is accessible at this stage and hence poison the struct
+ * pages now. If the memory block is accessible during the memory hotplug
+ * addition phase, then page poisoning is already performed in
+ * sparse_add_section().
+ */
+ if (mhp_off_inaccessible)
+ page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
+
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
for (i = 0; i < nr_pages; i++)
@@ -1328,7 +1337,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
}
#endif
-static bool mhp_supports_memmap_on_memory(unsigned long size)
+bool mhp_supports_memmap_on_memory(void)
{
unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
@@ -1337,17 +1346,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
* Besides having arch support and the feature enabled at runtime, we
* need a few more assumptions to hold true:
*
- * a) We span a single memory block: memory onlining/offlining happens
- * in memory block granularity. We don't want the vmemmap of online
- * memory blocks to reside on offline memory blocks. In the future,
- * we might want to support variable-sized memory blocks to make the
- * feature more versatile.
- *
- * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+ * a) The vmemmap pages span complete PMDs: We don't want vmemmap code
* to populate memory from the altmap for unrelated parts (i.e.,
* other memory blocks)
*
- * c) The vmemmap pages (and thereby the pages that will be exposed to
+ * b) The vmemmap pages (and thereby the pages that will be exposed to
* the buddy) have to cover full pageblocks: memory onlining/offlining
* code requires applicable ranges to be page-aligned, for example, to
* set the migratetypes properly.
@@ -1359,7 +1362,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
* altmap as an alternative source of memory, and we do not exactly
* populate a single PMD.
*/
- if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+ if (!mhp_memmap_on_memory())
return false;
/*
@@ -1382,6 +1385,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
return arch_supports_memmap_on_memory(vmemmap_size);
}
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
@@ -1415,7 +1419,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
}
static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
- u64 start, u64 size)
+ u64 start, u64 size, mhp_t mhp_flags)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1431,6 +1435,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
};
mhp_altmap.free = memory_block_memmap_on_memory_pages();
+ if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
+ mhp_altmap.inaccessible = true;
params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
GFP_KERNEL);
if (!params.altmap) {
@@ -1515,8 +1521,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* Self hosted memmap array
*/
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
- mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
- ret = create_altmaps_and_memory_blocks(nid, group, start, size);
+ mhp_supports_memmap_on_memory()) {
+ ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
if (ret)
goto error;
} else {
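
Since mhp_supports_memmap_on_memory() no longer takes a size argument and is now exported, a hotplug driver can probe for support itself. A hypothetical, non-standalone driver-side fragment (rc, mgid and start are illustrative only) combining it with the new MHP_OFFLINE_INACCESSIBLE flag:

	/* Hypothetical driver snippet: request a self-hosted memmap only when
	 * the core reports support, and mark the block as inaccessible until
	 * it is onlined so its struct pages get poisoned later in
	 * mhp_init_memmap_on_memory(). */
	mhp_t mhp_flags = MHP_NID_IS_MGID;

	if (mhp_supports_memmap_on_memory())
		mhp_flags |= MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE;

	rc = add_memory_driver_managed(mgid, start, memory_block_size_bytes(),
				       "System RAM (example)", mhp_flags);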
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..56f9a6ed939a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -19,6 +19,13 @@
* for anonymous memory. For process policy a per-process counter
* is used.
*
+ * weighted interleave
+ * Allocate memory interleaved over a set of nodes based on
+ * a set of weights (per-node), with normal fallback if it
+ * fails. Otherwise it operates the same as interleave.
+ * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
+ * on node 0 for every 1 page allocated on node 1.
+ *
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
@@ -131,6 +138,32 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+/*
+ * iw_table is the sysfs-set interleave weight table; a value of 0 means the
+ * system-default value should be used. A NULL iw_table also means that
+ * system-default values should be used. Until the system-default table
+ * is implemented, the system-default is always 1.
+ *
+ * iw_table is RCU protected
+ */
+static u8 __rcu *iw_table;
+static DEFINE_MUTEX(iw_table_lock);
+
+static u8 get_il_weight(int node)
+{
+ u8 *table;
+ u8 weight;
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ /* if no iw_table, use system default */
+ weight = table ? table[node] : 1;
+ /* if value in iw_table is 0, use system default */
+ weight = weight ? weight : 1;
+ rcu_read_unlock();
+ return weight;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -415,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_preferred,
},
+ [MPOL_WEIGHTED_INTERLEAVE] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_nodemask,
+ },
};
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
@@ -654,7 +691,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
{
struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
- unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
/* range check first */
@@ -682,9 +718,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
!(flags & MPOL_MF_STRICT))
return 1;
- if (endvma > end)
- endvma = end;
-
/*
* Check page nodes, and queue pages to move, in the current vma.
* But if no moving, and no strict checking, the scan can be skipped.
@@ -836,8 +869,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE)
+ if (new && (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
current->il_prev = MAX_NUMNODES-1;
+ current->il_weight = 0;
+ }
task_unlock(current);
mpol_put(old);
ret = 0;
@@ -862,6 +898,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
*nodes = pol->nodes;
break;
case MPOL_LOCAL:
@@ -946,6 +983,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->nodes);
+ } else if (pol == current->mempolicy &&
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+ if (current->il_weight)
+ *policy = current->il_prev;
+ else
+ *policy = next_node_in(current->il_prev,
+ pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -1310,7 +1354,8 @@ static long do_mbind(unsigned long start, unsigned long len,
* VMAs, the nodes will still be interleaved from the targeted
* nodemask, but one by one may be selected differently.
*/
- if (new->mode == MPOL_INTERLEAVE) {
+ if (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE) {
struct page *page;
unsigned int order;
unsigned long addr = -EFAULT;
@@ -1758,7 +1803,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
* @order: 0, or appropriate huge_page_order for interleaving
- * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
+ * MPOL_WEIGHTED_INTERLEAVE
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
@@ -1775,7 +1821,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
pol = __get_vma_policy(vma, addr, ilx);
if (!pol)
pol = get_task_policy(current);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
*ilx += vma->vm_pgoff >> order;
*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
}
@@ -1825,12 +1872,40 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
return zone >= dynamic_policy_zone;
}
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+ unsigned int node;
+ unsigned int cpuset_mems_cookie;
+
+retry:
+ /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ node = current->il_prev;
+ if (!current->il_weight || !node_isset(node, policy->nodes)) {
+ node = next_node_in(node, policy->nodes);
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry;
+ if (node == MAX_NUMNODES)
+ return node;
+ current->il_prev = node;
+ current->il_weight = get_il_weight(node);
+ }
+ current->il_weight--;
+ return node;
+}
+
/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
unsigned int nid;
+ unsigned int cpuset_mems_cookie;
+
+ /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ nid = next_node_in(current->il_prev, policy->nodes);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
- nid = next_node_in(current->il_prev, policy->nodes);
if (nid < MAX_NUMNODES)
current->il_prev = nid;
return nid;
@@ -1859,6 +1934,9 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
+ case MPOL_WEIGHTED_INTERLEAVE:
+ return weighted_interleave_nodes(policy);
+
case MPOL_BIND:
case MPOL_PREFERRED_MANY:
{
@@ -1883,6 +1961,59 @@ unsigned int mempolicy_slab_node(void)
}
}
+static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
+ nodemask_t *mask)
+{
+ /*
+ * The barrier stabilizes the nodemask locally so that it can be iterated
+ * over safely without concern for concurrent changes. Allocators validate
+ * that node selection does not violate mems_allowed, so this is safe.
+ */
+ barrier();
+ memcpy(mask, &pol->nodes, sizeof(nodemask_t));
+ barrier();
+ return nodes_weight(*mask);
+}
+
+static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
+{
+ nodemask_t nodemask;
+ unsigned int target, nr_nodes;
+ u8 *table;
+ unsigned int weight_total = 0;
+ u8 weight;
+ int nid;
+
+ nr_nodes = read_once_policy_nodemask(pol, &nodemask);
+ if (!nr_nodes)
+ return numa_node_id();
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ /* calculate the total weight */
+ for_each_node_mask(nid, nodemask) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ weight = weight ? weight : 1;
+ weight_total += weight;
+ }
+
+ /* Calculate the node offset based on totals */
+ target = ilx % weight_total;
+ nid = first_node(nodemask);
+ while (target) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ weight = weight ? weight : 1;
+ if (target < weight)
+ break;
+ target -= weight;
+ nid = next_node_in(nid, nodemask);
+ }
+ rcu_read_unlock();
+ return nid;
+}
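
A minimal userspace model of the mapping above, with plain arrays in place of the nodemask and iw_table: the interleave index is reduced modulo the total weight and then walked across the nodes, consuming one node weight at a time. With nodes {0,1} and weights {2,1} this reproduces the 2:1 pattern from the policy description:

#include <stdio.h>

/*
 * Map an interleave index ilx to a node, given per-node weights, the same
 * way weighted_interleave_nid() walks the nodemask: reduce ilx modulo the
 * total weight, then subtract node weights until the target fits.
 */
static int toy_weighted_interleave_nid(const int *nodes, const unsigned *weights,
				       int nr_nodes, unsigned long ilx)
{
	unsigned total = 0, target;
	int i = 0;

	for (int n = 0; n < nr_nodes; n++)
		total += weights[n];

	target = ilx % total;
	while (target >= weights[i]) {
		target -= weights[i];
		i++;
	}
	return nodes[i];
}

int main(void)
{
	int nodes[] = { 0, 1 };
	unsigned weights[] = { 2, 1 };	/* 2 pages on node 0 per 1 on node 1 */

	for (unsigned long ilx = 0; ilx < 6; ilx++)
		printf("ilx %lu -> node %d\n", ilx,
		       toy_weighted_interleave_nid(nodes, weights, 2, ilx));
	return 0;	/* prints nodes 0,0,1,0,0,1 */
}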
+
/*
* Do static interleaving for interleave index @ilx. Returns the ilx'th
* node in pol->nodes (starting from ilx=0), wrapping around if ilx
@@ -1890,20 +2021,12 @@ unsigned int mempolicy_slab_node(void)
*/
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
- nodemask_t nodemask = pol->nodes;
+ nodemask_t nodemask;
unsigned int target, nnodes;
int i;
int nid;
- /*
- * The barrier will stabilize the nodemask in a register or on
- * the stack so that it will stop changing under the code.
- *
- * Between first_node() and next_node(), pol->nodes could be changed
- * by other threads. So we put pol->nodes in a local stack.
- */
- barrier();
- nnodes = nodes_weight(nodemask);
+ nnodes = read_once_policy_nodemask(pol, &nodemask);
if (!nnodes)
return numa_node_id();
target = ilx % nnodes;
@@ -1951,6 +2074,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
*nid = (ilx == NO_INTERLEAVE_INDEX) ?
interleave_nodes(pol) : interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ *nid = (ilx == NO_INTERLEAVE_INDEX) ?
+ weighted_interleave_nodes(pol) :
+ weighted_interleave_nid(pol, ilx);
+ break;
}
return nodemask;
@@ -2012,6 +2140,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
*mask = mempolicy->nodes;
break;
@@ -2112,6 +2241,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode != MPOL_INTERLEAVE &&
+ pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
(!nodemask || node_isset(nid, *nodemask))) {
/*
* First, try to allocate THP only on local node, but
@@ -2247,6 +2377,121 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ struct task_struct *me = current;
+ unsigned int cpuset_mems_cookie;
+ unsigned long total_allocated = 0;
+ unsigned long nr_allocated = 0;
+ unsigned long rounds;
+ unsigned long node_pages, delta;
+ u8 *table, *weights, weight;
+ unsigned int weight_total = 0;
+ unsigned long rem_pages = nr_pages;
+ nodemask_t nodes;
+ int nnodes, node;
+ int resume_node = MAX_NUMNODES - 1;
+ u8 resume_weight = 0;
+ int prev_node;
+ int i;
+
+ if (!nr_pages)
+ return 0;
+
+ /* read the nodes onto the stack, retry if done during rebind */
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ nnodes = read_once_policy_nodemask(pol, &nodes);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ /* if the nodemask has become invalid, we cannot do anything */
+ if (!nnodes)
+ return 0;
+
+ /* Continue allocating from most recent node and adjust the nr_pages */
+ node = me->il_prev;
+ weight = me->il_weight;
+ if (weight && node_isset(node, nodes)) {
+ node_pages = min(rem_pages, weight);
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ /* if that's all the pages, no need to interleave */
+ if (rem_pages <= weight) {
+ me->il_weight -= rem_pages;
+ return total_allocated;
+ }
+ /* Otherwise we adjust remaining pages, continue from there */
+ rem_pages -= weight;
+ }
+ /* clear active weight in case of an allocation failure */
+ me->il_weight = 0;
+ prev_node = node;
+
+ /* create a local copy of node weights to operate on outside rcu */
+ weights = kzalloc(nr_node_ids, GFP_KERNEL);
+ if (!weights)
+ return total_allocated;
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ if (table)
+ memcpy(weights, table, nr_node_ids);
+ rcu_read_unlock();
+
+ /* calculate total, detect system default usage */
+ for_each_node_mask(node, nodes) {
+ if (!weights[node])
+ weights[node] = 1;
+ weight_total += weights[node];
+ }
+
+ /*
+ * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
+ * Track which node weighted interleave should resume from.
+ *
+ * If (rounds > 0) and (delta == 0), resume_node will always be
+ * the node following prev_node, and resume_weight will be that
+ * node's full weight.
+ */
+ rounds = rem_pages / weight_total;
+ delta = rem_pages % weight_total;
+ resume_node = next_node_in(prev_node, nodes);
+ resume_weight = weights[resume_node];
+ for (i = 0; i < nnodes; i++) {
+ node = next_node_in(prev_node, nodes);
+ weight = weights[node];
+ node_pages = weight * rounds;
+ /* If a delta exists, add this node's portion of the delta */
+ if (delta > weight) {
+ node_pages += weight;
+ delta -= weight;
+ } else if (delta) {
+ /* when delta is depleted, resume from that node */
+ node_pages += delta;
+ resume_node = node;
+ resume_weight = weight - delta;
+ delta = 0;
+ }
+ /* node_pages can be 0 if an allocation fails and rounds == 0 */
+ if (!node_pages)
+ break;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ if (total_allocated == nr_pages)
+ break;
+ prev_node = node;
+ }
+ me->il_prev = resume_node;
+ me->il_weight = resume_weight;
+ kfree(weights);
+ return total_allocated;
+}
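
A minimal userspace model of the rounds/delta bookkeeping above, again with plain arrays standing in for the nodemask and weight table: every node first receives weight * rounds pages, the remainder (delta) is handed out one node weight at a time starting after prev_node, and the node where delta runs out becomes the resume point with its leftover weight:

#include <stdio.h>

int main(void)
{
	/* Two nodes with weights 3 and 1, 9 pages still to place, resuming
	 * after node 1 (so the walk starts at node 0). */
	const unsigned weights[] = { 3, 1 };
	const int nr_nodes = 2;
	unsigned long rem_pages = 9;
	int prev = 1;

	unsigned total = 0;
	for (int n = 0; n < nr_nodes; n++)
		total += weights[n];

	unsigned long rounds = rem_pages / total;	/* 2 */
	unsigned long delta = rem_pages % total;	/* 1 */
	int resume_node = (prev + 1) % nr_nodes;
	unsigned resume_weight = weights[resume_node];

	for (int i = 0; i < nr_nodes; i++) {
		int node = (prev + 1) % nr_nodes;
		unsigned long node_pages = (unsigned long)weights[node] * rounds;

		if (delta > weights[node]) {
			node_pages += weights[node];
			delta -= weights[node];
		} else if (delta) {
			node_pages += delta;
			resume_node = node;
			resume_weight = weights[node] - delta;
			delta = 0;
		}
		printf("node %d gets %lu pages\n", node, node_pages);
		prev = node;
	}
	printf("resume at node %d with weight %u left\n",
	       resume_node, resume_weight);
	/* Output: node 0 gets 7 pages, node 1 gets 2 pages, then resume at
	 * node 0 with weight 2 left. */
	return 0;
}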
+
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
@@ -2287,6 +2532,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
return alloc_pages_bulk_array_interleave(gfp, pol,
nr_pages, page_array);
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return alloc_pages_bulk_array_weighted_interleave(
+ gfp, pol, nr_pages, page_array);
+
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
@@ -2362,6 +2611,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2498,6 +2748,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
polnid = interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ polnid = weighted_interleave_nid(pol, ilx);
+ break;
+
case MPOL_PREFERRED:
if (node_isset(curnid, pol->nodes))
goto out;
@@ -2872,6 +3126,7 @@ static const char * const policy_modes[] =
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
+ [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2931,6 +3186,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
}
break;
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
@@ -3041,6 +3297,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
nodes = pol->nodes;
break;
default:
@@ -3067,3 +3324,200 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+ struct kobj_attribute kobj_attr;
+ int nid;
+};
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct iw_node_attr *node_attr;
+ u8 weight;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ weight = get_il_weight(node_attr->nid);
+ return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct iw_node_attr *node_attr;
+ u8 *new;
+ u8 *old;
+ u8 weight = 0;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ if (count == 0 || sysfs_streq(buf, ""))
+ weight = 0;
+ else if (kstrtou8(buf, 0, &weight))
+ return -EINVAL;
+
+ new = kzalloc(nr_node_ids, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ mutex_lock(&iw_table_lock);
+ old = rcu_dereference_protected(iw_table,
+ lockdep_is_held(&iw_table_lock));
+ if (old)
+ memcpy(new, old, nr_node_ids);
+ new[node_attr->nid] = weight;
+ rcu_assign_pointer(iw_table, new);
+ mutex_unlock(&iw_table_lock);
+ synchronize_rcu();
+ kfree(old);
+ return count;
+}
+
+static struct iw_node_attr **node_attrs;
+
+static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
+ struct kobject *parent)
+{
+ if (!node_attr)
+ return;
+ sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+ kfree(node_attr->kobj_attr.attr.name);
+ kfree(node_attr);
+}
+
+static void sysfs_wi_release(struct kobject *wi_kobj)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++)
+ sysfs_wi_node_release(node_attrs[i], wi_kobj);
+ kobject_put(wi_kobj);
+}
+
+static const struct kobj_type wi_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = sysfs_wi_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+ struct iw_node_attr *node_attr;
+ char *name;
+
+ node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
+ if (!node_attr)
+ return -ENOMEM;
+
+ name = kasprintf(GFP_KERNEL, "node%d", nid);
+ if (!name) {
+ kfree(node_attr);
+ return -ENOMEM;
+ }
+
+ sysfs_attr_init(&node_attr->kobj_attr.attr);
+ node_attr->kobj_attr.attr.name = name;
+ node_attr->kobj_attr.attr.mode = 0644;
+ node_attr->kobj_attr.show = node_show;
+ node_attr->kobj_attr.store = node_store;
+ node_attr->nid = nid;
+
+ if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+ kfree(node_attr->kobj_attr.attr.name);
+ kfree(node_attr);
+ pr_err("failed to add attribute to weighted_interleave\n");
+ return -ENOMEM;
+ }
+
+ node_attrs[nid] = node_attr;
+ return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+ struct kobject *wi_kobj;
+ int nid, err;
+
+ wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!wi_kobj)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ "weighted_interleave");
+ if (err) {
+ kfree(wi_kobj);
+ return err;
+ }
+
+ for_each_node_state(nid, N_POSSIBLE) {
+ err = add_weight_node(nid, wi_kobj);
+ if (err) {
+ pr_err("failed to add sysfs [node%d]\n", nid);
+ break;
+ }
+ }
+ if (err)
+ kobject_put(wi_kobj);
+ return 0;
+}
+
+static void mempolicy_kobj_release(struct kobject *kobj)
+{
+ u8 *old;
+
+ mutex_lock(&iw_table_lock);
+ old = rcu_dereference_protected(iw_table,
+ lockdep_is_held(&iw_table_lock));
+ rcu_assign_pointer(iw_table, NULL);
+ mutex_unlock(&iw_table_lock);
+ synchronize_rcu();
+ kfree(old);
+ kfree(node_attrs);
+ kfree(kobj);
+}
+
+static const struct kobj_type mempolicy_ktype = {
+ .release = mempolicy_kobj_release
+};
+
+static int __init mempolicy_sysfs_init(void)
+{
+ int err;
+ static struct kobject *mempolicy_kobj;
+
+ mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
+ if (!mempolicy_kobj) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
+ GFP_KERNEL);
+ if (!node_attrs) {
+ err = -ENOMEM;
+ goto mempol_out;
+ }
+
+ err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
+ "mempolicy");
+ if (err)
+ goto node_out;
+
+ err = add_weighted_interleave_group(mempolicy_kobj);
+ if (err) {
+ pr_err("mempolicy sysfs structure failed to initialize\n");
+ kobject_put(mempolicy_kobj);
+ return err;
+ }
+
+ return err;
+node_out:
+ kfree(node_attrs);
+mempol_out:
+ kfree(mempolicy_kobj);
+err_out:
+ pr_err("failed to add mempolicy kobject to the system\n");
+ return err;
+}
+
+late_initcall(mempolicy_sysfs_init);
+#endif /* CONFIG_SYSFS */
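
Putting the pieces together from userspace: weights are written to /sys/kernel/mm/mempolicy/weighted_interleave/nodeN and a task opts in with set_mempolicy(). A minimal sketch that links against libnuma for the set_mempolicy() wrapper; the MPOL_WEIGHTED_INTERLEAVE value is defined locally on the assumption that it follows MPOL_PREFERRED_MANY in the uapi header (i.e. 6), since older headers will not carry it:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <numaif.h>

#ifndef MPOL_WEIGHTED_INTERLEAVE
#define MPOL_WEIGHTED_INTERLEAVE 6	/* assumed value, see uapi update */
#endif

int main(void)
{
	/* Give node 0 twice the weight of node 1 (needs root; path comes
	 * from add_weighted_interleave_group() above). */
	int fd = open("/sys/kernel/mm/mempolicy/weighted_interleave/node0",
		      O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "2", 1) != 1)
			perror("write node0 weight");
		close(fd);
	}

	/* Interleave this task's future allocations over nodes 0-1
	 * according to the weights above. */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);
	if (set_mempolicy(MPOL_WEIGHTED_INTERLEAVE, &nodemask,
			  sizeof(nodemask) * 8) != 0)
		perror("set_mempolicy");
	return 0;
}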
diff --git a/mm/migrate.c b/mm/migrate.c
index c27b1f8097d4..73a052a382f1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -211,14 +211,17 @@ static bool remove_migration_pte(struct folio *folio,
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
old_pte = ptep_get(pvmw.pte);
- if (pte_swp_soft_dirty(old_pte))
- pte = pte_mksoft_dirty(pte);
entry = pte_to_swp_entry(old_pte);
if (!is_migration_entry_young(entry))
pte = pte_mkold(pte);
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
pte = pte_mkdirty(pte);
+ if (pte_swp_soft_dirty(old_pte))
+ pte = pte_mksoft_dirty(pte);
+ else
+ pte = pte_clear_soft_dirty(pte);
+
if (is_writable_migration_entry(entry))
pte = pte_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(old_pte))
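
The reordered soft-dirty handling matters to userspace dirty tracking (CRIU-style), which observes the bit through /proc. A minimal sketch, assuming CONFIG_MEM_SOFT_DIRTY, that clears soft-dirty via /proc/self/clear_refs, re-dirties a page, and reads bit 55 of its pagemap entry:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psize, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;			/* make sure the page is mapped */

	/* Writing "4" clears the soft-dirty bits of the whole mm. */
	int fd = open("/proc/self/clear_refs", O_WRONLY);
	if (fd < 0 || write(fd, "4", 1) != 1)
		return 1;
	close(fd);

	p[0] = 2;			/* re-dirty the page */

	/* Each pagemap entry is 8 bytes; bit 55 is the soft-dirty bit. */
	uint64_t entry;
	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0 ||
	    pread(fd, &entry, sizeof(entry),
		  ((uintptr_t)p / psize) * sizeof(entry)) != sizeof(entry))
		return 1;
	close(fd);

	printf("soft-dirty: %d\n", (int)((entry >> 55) & 1));	/* expect 1 */
	return 0;
}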
diff --git a/mm/mmap.c b/mm/mmap.c
index d89770eaab6b..ccf377ee319f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -105,7 +105,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
* Requires inode->i_mapping->i_mmap_rwsem
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
- struct file *file, struct address_space *mapping)
+ struct address_space *mapping)
{
if (vma_is_shared_maywrite(vma))
mapping_unmap_writable(mapping);
@@ -126,7 +126,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
if (file) {
struct address_space *mapping = file->f_mapping;
i_mmap_lock_write(mapping);
- __remove_shared_vm_struct(vma, file, mapping);
+ __remove_shared_vm_struct(vma, mapping);
i_mmap_unlock_write(mapping);
}
}
@@ -392,26 +392,30 @@ static void __vma_link_file(struct vm_area_struct *vma,
flush_dcache_mmap_unlock(mapping);
}
+static void vma_link_file(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping;
+
+ if (file) {
+ mapping = file->f_mapping;
+ i_mmap_lock_write(mapping);
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
+ }
+}
+
static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
VMA_ITERATOR(vmi, mm, 0);
- struct address_space *mapping = NULL;
vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
vma_start_write(vma);
-
vma_iter_store(&vmi, vma);
-
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- __vma_link_file(vma, mapping);
- i_mmap_unlock_write(mapping);
- }
-
+ vma_link_file(vma);
mm->map_count++;
validate_mm(mm);
return 0;
@@ -519,10 +523,9 @@ static inline void vma_complete(struct vma_prepare *vp,
}
if (vp->remove && vp->file) {
- __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping);
+ __remove_shared_vm_struct(vp->remove, vp->mapping);
if (vp->remove2)
- __remove_shared_vm_struct(vp->remove2, vp->file,
- vp->mapping);
+ __remove_shared_vm_struct(vp->remove2, vp->mapping);
} else if (vp->insert) {
/*
* split_vma has split insert from vma, and needs
@@ -660,9 +663,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_prepare(&vp);
vma_adjust_trans_huge(vma, start, end, 0);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_pgoff = pgoff;
+ vma_set_range(vma, start, end, pgoff);
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
@@ -705,9 +706,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_adjust_trans_huge(vma, start, end, 0);
vma_iter_clear(vmi);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_pgoff = pgoff;
+ vma_set_range(vma, start, end, pgoff);
vma_complete(&vp, vmi, vma->vm_mm);
return 0;
}
@@ -861,13 +860,15 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* area is returned, or the function will return NULL
*/
static struct vm_area_struct
-*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
- struct vm_area_struct *prev, unsigned long addr, unsigned long end,
- unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy,
+*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
+ struct vm_area_struct *src, unsigned long addr, unsigned long end,
+ unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
+ struct mm_struct *mm = src->vm_mm;
+ struct anon_vma *anon_vma = src->anon_vma;
+ struct file *file = src->vm_file;
struct vm_area_struct *curr, *next, *res;
struct vm_area_struct *vma, *adjust, *remove, *remove2;
struct vm_area_struct *anon_dup = NULL;
@@ -1012,10 +1013,7 @@ static struct vm_area_struct
vma_prepare(&vp);
vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
-
- vma->vm_start = vma_start;
- vma->vm_end = vma_end;
- vma->vm_pgoff = vma_pgoff;
+ vma_set_range(vma, vma_start, vma_end, vma_pgoff);
if (vma_expanded)
vma_iter_store(vmi, vma);
@@ -2048,7 +2046,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
validate_mm(mm);
return error;
@@ -2142,7 +2139,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
validate_mm(mm);
return error;
@@ -2432,9 +2428,8 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
struct vm_area_struct *merged;
- merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, policy,
- uffd_ctx, anon_name);
+ merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
+ pgoff, policy, uffd_ctx, anon_name);
if (merged)
return merged;
@@ -2464,9 +2459,8 @@ static struct vm_area_struct
struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff)
{
- return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
+ vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}
/*
@@ -2480,10 +2474,9 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
/* vma is specified as prev, so case 1 or 2 will apply. */
- return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta,
- vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma), vma->vm_userfaultfd_ctx,
- anon_vma_name(vma));
+ return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
+ vma->vm_flags, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}
/*
@@ -2810,11 +2803,9 @@ cannot_expand:
}
vma_iter_config(&vmi, addr, end);
- vma->vm_start = addr;
- vma->vm_end = end;
+ vma_set_range(vma, addr, end, pgoff);
vm_flags_init(vma, vm_flags);
vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
if (file) {
vma->vm_file = get_file(file);
@@ -2891,16 +2882,7 @@ cannot_expand:
vma_start_write(vma);
vma_iter_store(&vmi, vma);
mm->map_count++;
- if (vma->vm_file) {
- i_mmap_lock_write(vma->vm_file->f_mapping);
- if (vma_is_shared_maywrite(vma))
- mapping_allow_writable(vma->vm_file->f_mapping);
-
- flush_dcache_mmap_lock(vma->vm_file->f_mapping);
- vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
- flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- }
+ vma_link_file(vma);
/*
* vma_merge() calls khugepaged_enter_vma() either, the below
@@ -3173,9 +3155,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto unacct_fail;
vma_set_anonymous(vma);
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_pgoff = addr >> PAGE_SHIFT;
+ vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
vm_flags_init(vma, flags);
vma->vm_page_prot = vm_get_page_prot(flags);
vma_start_write(vma);
@@ -3412,9 +3392,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma = vm_area_dup(vma);
if (!new_vma)
goto out;
- new_vma->vm_start = addr;
- new_vma->vm_end = addr + len;
- new_vma->vm_pgoff = pgoff;
+ vma_set_range(new_vma, addr, addr + len, pgoff);
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
if (anon_vma_clone(new_vma, vma))
@@ -3582,9 +3560,7 @@ static struct vm_area_struct *__install_special_mapping(
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
- vma->vm_start = addr;
- vma->vm_end = addr + len;
-
+ vma_set_range(vma, addr, addr + len, 0);
vm_flags_init(vma, (vm_flags | mm->def_flags |
VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
@@ -3868,7 +3844,7 @@ static int init_user_reserve(void)
free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
- sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);
return 0;
}
subsys_initcall(init_user_reserve);
@@ -3889,7 +3865,7 @@ static int init_admin_reserve(void)
free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
- sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);
return 0;
}
subsys_initcall(init_admin_reserve);
@@ -3921,12 +3897,12 @@ static int reserve_mem_notifier(struct notifier_block *nb,
case MEM_ONLINE:
/* Default max is 128MB. Leave alone if modified by operator. */
tmp = sysctl_user_reserve_kbytes;
- if (0 < tmp && tmp < (1UL << 17))
+ if (tmp > 0 && tmp < SZ_128K)
init_user_reserve();
/* Default max is 8MB. Leave alone if modified by operator. */
tmp = sysctl_admin_reserve_kbytes;
- if (0 < tmp && tmp < (1UL << 13))
+ if (tmp > 0 && tmp < SZ_8K)
init_admin_reserve();
break;
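
vma_set_range() is introduced elsewhere in this series (in mm/internal.h, not shown in this excerpt); judging from the call sites above it simply folds the three field assignments into one helper, presumably along these lines:

static inline void vma_set_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end,
				 pgoff_t pgoff)
{
	vma->vm_start = start;
	vma->vm_end = end;
	vma->vm_pgoff = pgoff;
}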
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 604ddf08affe..99b3e9408aa0 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -50,12 +50,21 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
#ifdef CONFIG_SMP
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
+ struct encoded_page **pages = batch->encoded_pages;
+
for (int i = 0; i < batch->nr; i++) {
- struct encoded_page *enc = batch->encoded_pages[i];
+ struct encoded_page *enc = pages[i];
- if (encoded_page_flags(enc)) {
+ if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
struct page *page = encoded_page_ptr(enc);
- folio_remove_rmap_pte(page_folio(page), page, vma);
+ unsigned int nr_pages = 1;
+
+ if (unlikely(encoded_page_flags(enc) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_pages = encoded_nr_pages(pages[++i]);
+
+ folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
+ vma);
}
}
}
@@ -82,26 +91,62 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
}
#endif
-static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+/*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
+ */
+#define MAX_NR_FOLIOS_PER_FREE 512
+
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
- struct mmu_gather_batch *batch;
+ struct encoded_page **pages = batch->encoded_pages;
+ unsigned int nr, nr_pages;
- for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- struct encoded_page **pages = batch->encoded_pages;
+ while (batch->nr) {
+ if (!page_poisoning_enabled_static() && !want_init_on_free()) {
+ nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
- do {
/*
- * limit free batch count when PAGE_SIZE > 4K
+ * Make sure we cover page + nr_pages, and don't leave
+ * nr_pages behind when capping the number of entries.
+ */
+ if (unlikely(encoded_page_flags(pages[nr - 1]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr++;
+ } else {
+ /*
+ * With page poisoning and init_on_free, the time it
+ * takes to free memory grows proportionally with the
+ * actual memory size. Therefore, limit based on the
+ * actual memory size and not the number of involved
+ * folios.
*/
- unsigned int nr = min(512U, batch->nr);
+ for (nr = 0, nr_pages = 0;
+ nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
+ nr++) {
+ if (unlikely(encoded_page_flags(pages[nr]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_pages += encoded_nr_pages(pages[++nr]);
+ else
+ nr_pages++;
+ }
+ }
- free_pages_and_swap_cache(pages, nr);
- pages += nr;
- batch->nr -= nr;
+ free_pages_and_swap_cache(pages, nr);
+ pages += nr;
+ batch->nr -= nr;
- cond_resched();
- } while (batch->nr);
+ cond_resched();
}
+}
+
+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
+ __tlb_batch_free_encoded_pages(batch);
tlb->active = &tlb->local;
}
@@ -116,14 +161,19 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
tlb->local.next = NULL;
}
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
+static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
+ struct page *page, unsigned int nr_pages, bool delay_rmap,
+ int page_size)
{
+ int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
VM_WARN_ON(tlb->page_size != page_size);
+ VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
+ VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif
batch = tlb->active;
@@ -131,17 +181,40 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, i
* Add the page and check if we are full. If so
* force a flush.
*/
- batch->encoded_pages[batch->nr++] = page;
- if (batch->nr == batch->max) {
+ if (likely(nr_pages == 1)) {
+ batch->encoded_pages[batch->nr++] = encode_page(page, flags);
+ } else {
+ flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
+ batch->encoded_pages[batch->nr++] = encode_page(page, flags);
+ batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
+ }
+ /*
+ * Make sure that we can always add another "page" + "nr_pages",
+ * requiring two entries instead of only a single one.
+ */
+ if (batch->nr >= batch->max - 1) {
if (!tlb_next_batch(tlb))
return true;
batch = tlb->active;
}
- VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));
+ VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);
return false;
}
+bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
+ unsigned int nr_pages, bool delay_rmap)
+{
+ return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
+ PAGE_SIZE);
+}
+
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+ bool delay_rmap, int page_size)
+{
+ return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
+}
+
#endif /* MMU_GATHER_NO_GATHER */
#ifdef CONFIG_MMU_GATHER_TABLE_FREE
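Editorial note on the batching change above: a folio spanning several pages is now stored as two consecutive batch slots, a flagged page pointer followed by a raw page count, which is why the flush path must never split a pointer from its count slot (hence the "batch->max - 1" checks). A minimal userspace sketch of that layout follows; the tag bit NR_NEXT and the encode/decode helpers are made-up stand-ins for the kernel's encoded_page machinery, not the real API.

#include <stdint.h>
#include <stdio.h>

#define NR_NEXT 0x1UL   /* assumed tag bit standing in for ENCODED_PAGE_BIT_NR_PAGES_NEXT */

static uintptr_t encode_ptr(void *p, unsigned long flags)
{
	return (uintptr_t)p | flags;
}

static void *decode_ptr(uintptr_t e)
{
	return (void *)(e & ~NR_NEXT);
}

int main(void)
{
	int folio_a, folio_b;           /* stand-ins for folios */
	uintptr_t batch[8];
	unsigned int nr = 0;

	/* a single page occupies one slot */
	batch[nr++] = encode_ptr(&folio_a, 0);
	/* 16 pages of one folio occupy two slots: tagged pointer, then count */
	batch[nr++] = encode_ptr(&folio_b, NR_NEXT);
	batch[nr++] = 16;

	/* consumer side, cf. __tlb_batch_free_encoded_pages()/release_pages() */
	for (unsigned int i = 0; i < nr; i++) {
		void *p = decode_ptr(batch[i]);
		unsigned long npages = 1;

		if (batch[i] & NR_NEXT)
			npages = batch[++i];    /* the next slot is the count */
		printf("free %p: %lu page(s)\n", p, npages);
	}
	return 0;
}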
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 81991102f785..f8a4544b4601 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -198,13 +198,13 @@ static long change_pte_range(struct mmu_gather *tlb,
pte_t newpte;
if (is_writable_migration_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
/*
* A protection check is difficult so
* just be safe and disable write
*/
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
else
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 150d4f23b010..9faca05d124e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5572,37 +5572,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
mutex_unlock(&pcp_batch_high_lock);
}
-static void zone_pcp_update_cacheinfo(struct zone *zone)
+static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
- int cpu;
struct per_cpu_pages *pcp;
struct cpu_cacheinfo *cci;
- for_each_online_cpu(cpu) {
- pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- cci = get_cpu_cacheinfo(cpu);
- /*
- * If data cache slice of CPU is large enough, "pcp->batch"
- * pages can be preserved in PCP before draining PCP for
- * consecutive high-order pages freeing without allocation.
- * This can reduce zone lock contention without hurting
- * cache-hot pages sharing.
- */
- spin_lock(&pcp->lock);
- if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
- pcp->flags |= PCPF_FREE_HIGH_BATCH;
- else
- pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
- spin_unlock(&pcp->lock);
- }
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ cci = get_cpu_cacheinfo(cpu);
+ /*
+ * If data cache slice of CPU is large enough, "pcp->batch"
+ * pages can be preserved in PCP before draining PCP for
+ * consecutive high-order pages freeing without allocation.
+ * This can reduce zone lock contention without hurting
+ * cache-hot pages sharing.
+ */
+ spin_lock(&pcp->lock);
+ if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+ pcp->flags |= PCPF_FREE_HIGH_BATCH;
+ else
+ pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+ spin_unlock(&pcp->lock);
}
-void setup_pcp_cacheinfo(void)
+void setup_pcp_cacheinfo(unsigned int cpu)
{
struct zone *zone;
for_each_populated_zone(zone)
- zone_pcp_update_cacheinfo(zone);
+ zone_pcp_update_cacheinfo(zone, cpu);
}
/*
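For a sense of when the per-CPU path above sets PCPF_FREE_HIGH_BATCH, here is a rough back-of-the-envelope check; the cache-slice size and batch value are assumptions for illustration, not values taken from this patch.

#include <stdio.h>

int main(void)
{
	unsigned long slice_bytes = 1UL << 20;  /* assumed per-CPU data cache slice: 1 MiB */
	unsigned int page_shift = 12;           /* 4 KiB pages */
	unsigned int batch = 63;                /* assumed pcp->batch */
	unsigned long slice_pages = slice_bytes >> page_shift;

	/* mirrors: (cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch */
	printf("%lu slice pages vs threshold %u -> flag %s\n",
	       slice_pages, 3 * batch,
	       slice_pages > 3 * batch ? "set" : "clear");
	return 0;
}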
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 03c1bdae4a43..106e1d66e9f9 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
+#include <linux/debugfs.h>
#include <linux/ptdump.h>
#include <linux/kasan.h>
@@ -163,3 +164,24 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
}
+
+static int check_wx_show(struct seq_file *m, void *v)
+{
+ if (ptdump_check_wx())
+ seq_puts(m, "SUCCESS\n");
+ else
+ seq_puts(m, "FAILED\n");
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(check_wx);
+
+static int ptdump_debugfs_init(void)
+{
+ debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops);
+
+ return 0;
+}
+
+device_initcall(ptdump_debugfs_init);
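The new debugfs entry can be exercised from userspace. Assuming debugfs is mounted at its usual /sys/kernel/debug location (and the kernel has the W^X check built in), reading the file re-runs ptdump_check_wx() and returns the SUCCESS/FAILED string produced above:

#include <stdio.h>

int main(void)
{
	char buf[16] = "";
	FILE *f = fopen("/sys/kernel/debug/check_wx_pages", "r");  /* needs root */

	if (!f) {
		perror("check_wx_pages");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("W^X check: %s", buf);   /* "SUCCESS\n" or "FAILED\n" */
	fclose(f);
	return 0;
}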
diff --git a/mm/readahead.c b/mm/readahead.c
index 2648ec4f0494..1e74455f908e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -500,10 +500,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
if (new_order < MAX_PAGECACHE_ORDER) {
new_order += 2;
- if (new_order > MAX_PAGECACHE_ORDER)
- new_order = MAX_PAGECACHE_ORDER;
- while ((1 << new_order) > ra->size)
- new_order--;
+ new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+ new_order = min_t(unsigned int, new_order, ilog2(ra->size));
}
filemap_invalidate_lock_shared(mapping);
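The two min_t() clamps above are equivalent to the removed cap-and-decrement loop: the order is limited to MAX_PAGECACHE_ORDER and to the largest order whose page count still fits in ra->size. A quick standalone check of that equivalence; the MAX_PAGECACHE_ORDER value below is an assumed stand-in, since the real value depends on configuration.

#include <stdio.h>

static unsigned int ilog2u(unsigned long x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned int max_order = 8;   /* assumed MAX_PAGECACHE_ORDER */

	for (unsigned long size = 1; size <= 512; size++) {
		for (unsigned int order = 0; order <= 12; order++) {
			/* old: cap, then decrement while too large */
			unsigned int old = order > max_order ? max_order : order;
			while ((1UL << old) > size)
				old--;
			/* new: two min() clamps */
			unsigned int new = order < max_order ? order : max_order;
			if (new > ilog2u(size))
				new = ilog2u(size);
			if (old != new)
				printf("mismatch at size=%lu order=%u\n", size, order);
		}
	}
	printf("done\n");
	return 0;
}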
diff --git a/mm/rmap.c b/mm/rmap.c
index f5d43edad529..3746a5531018 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1780,7 +1780,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
set_huge_pte_at(mm, address, pvmw.pte, pteval,
hsz);
} else {
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -1795,7 +1795,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
} else if (folio_test_anon(folio)) {
swp_entry_t entry = page_swap_entry(subpage);
pte_t swp_pte;
@@ -1903,7 +1903,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*
* See Documentation/mm/mmu_notifier.rst
*/
- dec_mm_counter(mm, mm_counter_file(&folio->page));
+ dec_mm_counter(mm, mm_counter_file(folio));
}
discard:
if (unlikely(folio_test_hugetlb(folio)))
@@ -2169,7 +2169,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
- compound_order(&folio->page));
+ folio_order(folio));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -2181,7 +2181,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
set_huge_pte_at(mm, address, pvmw.pte, pteval,
hsz);
} else {
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -2196,7 +2196,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
} else {
swp_entry_t entry;
pte_t swp_pte;
@@ -2261,7 +2261,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
- compound_order(&folio->page));
+ folio_order(folio));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
diff --git a/mm/sparse.c b/mm/sparse.c
index 338cf946dee8..aed0951b87fa 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -908,7 +908,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- page_init_poison(memmap, sizeof(struct page) * nr_pages);
+ if (!altmap || !altmap->inaccessible)
+ page_init_poison(memmap, sizeof(struct page) * nr_pages);
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
diff --git a/mm/swap.c b/mm/swap.c
index cd8f0150ba3a..e5380d732c0d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -967,11 +967,17 @@ void release_pages(release_pages_arg arg, int nr)
unsigned int lock_batch;
for (i = 0; i < nr; i++) {
+ unsigned int nr_refs = 1;
struct folio *folio;
/* Turn any of the argument types into a folio */
folio = page_folio(encoded_page_ptr(encoded[i]));
+ /* Is our next entry actually "nr_pages" -> "nr_refs" ? */
+ if (unlikely(encoded_page_flags(encoded[i]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_refs = encoded_nr_pages(encoded[++i]);
+
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
@@ -990,14 +996,14 @@ void release_pages(release_pages_arg arg, int nr)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_page(&folio->page))
+ if (put_devmap_managed_page_refs(&folio->page, nr_refs))
continue;
- if (folio_put_testzero(folio))
+ if (folio_ref_sub_and_test(folio, nr_refs))
free_zone_device_page(&folio->page);
continue;
}
- if (!folio_put_testzero(folio))
+ if (!folio_ref_sub_and_test(folio, nr_refs))
continue;
if (folio_test_large(folio)) {
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0bec1f705f8e..90973ce7881d 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -273,6 +273,9 @@ void free_swap_slot(swp_entry_t entry)
{
struct swap_slots_cache *cache;
+ /* Large folio swap slots are not covered here. */
+ zswap_invalidate(entry);
+
cache = raw_cpu_ptr(&swp_slots);
if (likely(use_swap_slot_cache && cache->slots_ret)) {
spin_lock_irq(&cache->free_lock);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7255c01a1e4e..2f540748f7c0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -311,8 +311,19 @@ void free_page_and_swap_cache(struct page *page)
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
lru_add_drain();
- for (int i = 0; i < nr; i++)
- free_swap_cache(encoded_page_ptr(pages[i]));
+ for (int i = 0; i < nr; i++) {
+ struct page *page = encoded_page_ptr(pages[i]);
+
+ /*
+ * Skip over the "nr_pages" entry. It's sufficient to call
+ * free_swap_cache() only once per folio.
+ */
+ if (unlikely(encoded_page_flags(pages[i]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ i++;
+
+ free_swap_cache(page);
+ }
release_pages(pages, nr);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 746aa9da5302..d1bd8d1e17bd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
if (was_full && (si->flags & SWP_WRITEOK))
add_to_avail_list(si);
}
- atomic_long_add(nr_entries, &nr_swap_pages);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -746,12 +744,19 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify = NULL;
while (offset <= end) {
arch_swap_invalidate_page(si->type, offset);
- zswap_invalidate(si->type, offset);
if (swap_slot_free_notify)
swap_slot_free_notify(si->bdev, offset);
offset++;
}
clear_shadow_from_swap_cache(si->type, begin, end);
+
+ /*
+ * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
+ * only after the above cleanups are done.
+ */
+ smp_wmb();
+ atomic_long_add(nr_entries, &nr_swap_pages);
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
}
static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
@@ -2049,7 +2054,7 @@ static int try_to_unuse(unsigned int type)
unsigned int i;
if (!READ_ONCE(si->inuse_pages))
- return 0;
+ goto success;
retry:
retval = shmem_unuse(type);
@@ -2130,6 +2135,12 @@ retry:
return -EINTR;
}
+success:
+ /*
+ * Make sure that further cleanups after try_to_unuse() returns happen
+ * after swap_range_free() reduces si->inuse_pages to 0.
+ */
+ smp_mb();
return 0;
}
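The smp_wmb()/smp_mb() pair introduced above orders the swap-cache cleanups against the inuse_pages update: the freer publishes "inuse_pages reached 0" only after its cleanups, and try_to_unuse()'s callers may rely on those cleanups once they observe 0. Below is a userspace analogue of the same publish/observe pattern using C11 fences rather than the kernel barriers; it is a sketch of the idea, not the kernel code.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long inuse_pages = 1;
static int shadow_cleared;      /* stands in for clear_shadow_from_swap_cache() */

static void range_free(void)    /* freer side, cf. swap_range_free() */
{
	shadow_cleared = 1;                             /* cleanups first */
	atomic_thread_fence(memory_order_release);      /* cf. smp_wmb() */
	atomic_store_explicit(&inuse_pages, 0, memory_order_relaxed);
}

static int unuse(void)          /* reader side, cf. try_to_unuse() */
{
	if (atomic_load_explicit(&inuse_pages, memory_order_relaxed))
		return -1;                              /* still in use, retry later */
	atomic_thread_fence(memory_order_acquire);      /* cf. smp_mb() before returning */
	return shadow_cleared;                          /* observing 0 implies seeing 1 here */
}

int main(void)
{
	range_free();
	printf("%d\n", unuse());    /* single-threaded demo; prints 1 */
	return 0;
}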
@@ -2348,8 +2359,6 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- zswap_swapon(p->type);
-
spin_lock(&swap_lock);
spin_lock(&p->lock);
setup_swap_info(p, prio, swap_map, cluster_info);
@@ -3167,6 +3176,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
+ error = zswap_swapon(p->type, maxpages);
+ if (error)
+ goto free_swap_address_space;
+
/*
* Flush any pending IO and dirty mappings before we start using this
* swap device.
@@ -3175,7 +3188,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
- goto free_swap_address_space;
+ goto free_swap_zswap;
}
mutex_lock(&swapon_mutex);
@@ -3199,6 +3212,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
+free_swap_zswap:
+ zswap_swapoff(p->type);
free_swap_address_space:
exit_swap_address_space(p->type);
bad_swap_unlock_inode:
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7cf7d4384259..4744d6a96f96 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -20,19 +20,11 @@
#include "internal.h"
static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len)
+bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
- /*
- * Make sure that the dst range is both valid and fully within a
- * single existing vma.
- */
- struct vm_area_struct *dst_vma;
-
- dst_vma = find_vma(dst_mm, dst_start);
- if (!range_in_vma(dst_vma, dst_start, dst_start + len))
- return NULL;
+ /* Make sure that the dst range is fully within dst_vma. */
+ if (dst_end > dst_vma->vm_end)
+ return false;
/*
* Check the vma is registered in uffd, this is required to
@@ -40,11 +32,122 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
- return NULL;
+ return false;
+
+ return true;
+}
+
+static __always_inline
+struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ vma = ERR_PTR(-ENOENT);
+ else if (!(vma->vm_flags & VM_SHARED) &&
+ unlikely(anon_vma_prepare(vma)))
+ vma = ERR_PTR(-ENOMEM);
+
+ return vma;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * lock_vma() - Lookup and lock vma corresponding to @address.
+ * @mm: mm to search vma in.
+ * @address: address that the vma should contain.
+ *
+ * Should be called without holding mmap_lock. vma should be unlocked after use
+ * with unlock_vma().
+ *
+ * Return: A locked vma containing @address, -ENOENT if no vma is found, or
+ * -ENOMEM if anon_vma couldn't be allocated.
+ */
+static struct vm_area_struct *lock_vma(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct vm_area_struct *vma;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (vma) {
+ /*
+ * lock_vma_under_rcu() only checks anon_vma for private
+ * anonymous mappings. But we need to ensure it is assigned in
+ * private file-backed vmas as well.
+ */
+ if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
+ vma_end_read(vma);
+ else
+ return vma;
+ }
+
+ mmap_read_lock(mm);
+ vma = find_vma_and_prepare_anon(mm, address);
+ if (!IS_ERR(vma)) {
+ /*
+ * We cannot use vma_start_read() as it may fail due to
+ * a false-locked result (see comment in vma_start_read()). We
+ * can avoid that by directly locking vm_lock under
+ * mmap_lock, which guarantees that nobody can lock the
+ * vma for write (vma_start_write()) under us.
+ */
+ down_read(&vma->vm_lock->lock);
+ }
+
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ struct vm_area_struct *dst_vma;
+
+ dst_vma = lock_vma(dst_mm, dst_start);
+ if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
+ return dst_vma;
+
+ vma_end_read(dst_vma);
+ return ERR_PTR(-ENOENT);
+}
+
+static void uffd_mfill_unlock(struct vm_area_struct *vma)
+{
+ vma_end_read(vma);
+}
+#else
+
+static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ struct vm_area_struct *dst_vma;
+
+ mmap_read_lock(dst_mm);
+ dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
+ if (IS_ERR(dst_vma))
+ goto out_unlock;
+
+ if (validate_dst_vma(dst_vma, dst_start + len))
+ return dst_vma;
+
+ dst_vma = ERR_PTR(-ENOENT);
+out_unlock:
+ mmap_read_unlock(dst_mm);
return dst_vma;
}
+static void uffd_mfill_unlock(struct vm_area_struct *vma)
+{
+ mmap_read_unlock(vma->vm_mm);
+}
+#endif
+
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
unsigned long dst_addr)
@@ -124,7 +227,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
* Must happen after rmap, as mm_counter() checks mapping (via
* PageAnon()), which is set by __page_set_anon_rmap().
*/
- inc_mm_counter(dst_mm, mm_counter(page));
+ inc_mm_counter(dst_mm, mm_counter(folio));
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -350,18 +453,18 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
* mfill_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either the vma lock or mmap_lock held; it will release the
+ * lock before returning.
*/
static __always_inline ssize_t mfill_atomic_hugetlb(
+ struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
- int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
unsigned long src_addr, dst_addr;
@@ -379,7 +482,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
* feature is not supported.
*/
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
return -EINVAL;
}
@@ -402,24 +506,28 @@ retry:
* retry, dst_vma will be set to NULL and we must lookup again.
*/
if (!dst_vma) {
+ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
+ if (IS_ERR(dst_vma)) {
+ err = PTR_ERR(dst_vma);
+ goto out;
+ }
+
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
- goto out_unlock;
+ if (!is_vm_hugetlb_page(dst_vma))
+ goto out_unlock_vma;
err = -EINVAL;
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
- goto out_unlock;
+ goto out_unlock_vma;
- vm_shared = dst_vma->vm_flags & VM_SHARED;
- }
-
- /*
- * If not shared, ensure the dst_vma has a anon_vma.
- */
- err = -ENOMEM;
- if (!vm_shared) {
- if (unlikely(anon_vma_prepare(dst_vma)))
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ down_read(&ctx->map_changing_lock);
+ err = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
}
@@ -463,7 +571,8 @@ retry:
cond_resched();
if (unlikely(err == -ENOENT)) {
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
BUG_ON(!folio);
err = copy_folio_from_user(folio,
@@ -472,16 +581,6 @@ retry:
err = -EFAULT;
goto out;
}
- mmap_read_lock(dst_mm);
- /*
- * If memory mappings are changing because of non-cooperative
- * operation (e.g. mremap) running in parallel, bail out and
- * request the user to retry later
- */
- if (mmap_changing && atomic_read(mmap_changing)) {
- err = -EAGAIN;
- break;
- }
dst_vma = NULL;
goto retry;
@@ -501,7 +600,9 @@ retry:
}
out_unlock:
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+out_unlock_vma:
+ uffd_mfill_unlock(dst_vma);
out:
if (folio)
folio_put(folio);
@@ -512,11 +613,11 @@ out:
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
@@ -564,13 +665,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
return err;
}
-static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags)
{
+ struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
@@ -593,24 +694,24 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
copied = 0;
folio = NULL;
retry:
- mmap_read_lock(dst_mm);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
+ if (IS_ERR(dst_vma)) {
+ err = PTR_ERR(dst_vma);
+ goto out;
+ }
/*
* If memory mappings are changing because of non-cooperative
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
- goto out_unlock;
-
- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma)
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -EINVAL;
@@ -633,8 +734,8 @@ retry:
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
- len, mmap_changing, flags);
+ return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
+ src_start, len, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -642,16 +743,6 @@ retry:
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
- /*
- * Ensure the dst_vma has a anon_vma or this page
- * would get a NULL anon_vma when moved in the
- * dst_vma.
- */
- err = -ENOMEM;
- if (!(dst_vma->vm_flags & VM_SHARED) &&
- unlikely(anon_vma_prepare(dst_vma)))
- goto out_unlock;
-
while (src_addr < src_start + len) {
pmd_t dst_pmdval;
@@ -693,7 +784,8 @@ retry:
if (unlikely(err == -ENOENT)) {
void *kaddr;
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
BUG_ON(!folio);
kaddr = kmap_local_folio(folio, 0);
@@ -723,7 +815,8 @@ retry:
}
out_unlock:
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
out:
if (folio)
folio_put(folio);
@@ -733,34 +826,33 @@ out:
return copied ? copied : err;
}
-ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, uffd_flags_t flags)
+ uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+ return mfill_atomic(ctx, dst_start, src_start, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
-ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long start,
+ unsigned long len)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
-ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags)
+ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
-ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags)
+ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}
@@ -793,10 +885,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
return ret;
}
-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool enable_wp,
- atomic_t *mmap_changing)
+int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_wp)
{
+ struct mm_struct *dst_mm = ctx->mm;
unsigned long end = start + len;
unsigned long _start, _end;
struct vm_area_struct *dst_vma;
@@ -820,8 +912,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -ENOENT;
@@ -850,6 +943,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0;
}
out_unlock:
+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return err;
}
@@ -959,6 +1053,33 @@ static int move_swap_pte(struct mm_struct *mm,
return 0;
}
+static int move_zeropage_pte(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl)
+{
+ pte_t zero_pte;
+
+ double_pt_lock(dst_ptl, src_ptl);
+ if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+ !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+
+ zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ dst_vma->vm_page_prot));
+ ptep_clear_flush(src_vma, src_addr, src_pte);
+ set_pte_at(mm, dst_addr, dst_pte, zero_pte);
+ double_pt_unlock(dst_ptl, src_ptl);
+
+ return 0;
+}
+
+
/*
* The mmap_lock for reading is held by the caller. Just move the page
* from src_pmd to dst_pmd if possible, and return true if succeeded
@@ -1041,6 +1162,14 @@ retry:
}
if (pte_present(orig_src_pte)) {
+ if (is_zero_pfn(pte_pfn(orig_src_pte))) {
+ err = move_zeropage_pte(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte,
+ dst_ptl, src_ptl);
+ goto out;
+ }
+
/*
* Pin and lock both source folio and anon_vma. Since we are in
* RCU read section, we can't block, so on contention have to
@@ -1224,27 +1353,136 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
return -EINVAL;
+ return 0;
+}
+
+static __always_inline
+int find_vmas_mm_locked(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ vma = find_vma_and_prepare_anon(mm, dst_start);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ *dst_vmap = vma;
+ /* Skip finding src_vma if src_start is in dst_vma */
+ if (src_start >= vma->vm_start && src_start < vma->vm_end)
+ goto out_success;
+
+ vma = vma_lookup(mm, src_start);
+ if (!vma)
+ return -ENOENT;
+out_success:
+ *src_vmap = vma;
+ return 0;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static int uffd_move_lock(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ struct vm_area_struct *vma;
+ int err;
+
+ vma = lock_vma(mm, dst_start);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ *dst_vmap = vma;
/*
- * Ensure the dst_vma has a anon_vma or this page
- * would get a NULL anon_vma when moved in the
- * dst_vma.
+ * Skip finding src_vma if src_start is in dst_vma. This also ensures
+ * that we don't lock the same vma twice.
*/
- if (unlikely(anon_vma_prepare(dst_vma)))
- return -ENOMEM;
+ if (src_start >= vma->vm_start && src_start < vma->vm_end) {
+ *src_vmap = vma;
+ return 0;
+ }
- return 0;
+ /*
+ * Using lock_vma() to get src_vma can lead to following deadlock:
+ *
+ * Thread1 Thread2
+ * ------- -------
+ * vma_start_read(dst_vma)
+ * mmap_write_lock(mm)
+ * vma_start_write(src_vma)
+ * vma_start_read(src_vma)
+ * mmap_read_lock(mm)
+ * vma_start_write(dst_vma)
+ */
+ *src_vmap = lock_vma_under_rcu(mm, src_start);
+ if (likely(*src_vmap))
+ return 0;
+
+ /* Undo any locking and retry in mmap_lock critical section */
+ vma_end_read(*dst_vmap);
+
+ mmap_read_lock(mm);
+ err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+ if (!err) {
+ /*
+ * See comment in lock_vma() as to why not using
+ * vma_start_read() here.
+ */
+ down_read(&(*dst_vmap)->vm_lock->lock);
+ if (*dst_vmap != *src_vmap)
+ down_read(&(*src_vmap)->vm_lock->lock);
+ }
+ mmap_read_unlock(mm);
+ return err;
+}
+
+static void uffd_move_unlock(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
+{
+ vma_end_read(src_vma);
+ if (src_vma != dst_vma)
+ vma_end_read(dst_vma);
+}
+
+#else
+
+static int uffd_move_lock(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ int err;
+
+ mmap_read_lock(mm);
+ err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+ if (err)
+ mmap_read_unlock(mm);
+ return err;
}
+static void uffd_move_unlock(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
+{
+ mmap_assert_locked(src_vma->vm_mm);
+ mmap_read_unlock(dst_vma->vm_mm);
+}
+#endif
+
/**
* move_pages - move arbitrary anonymous pages of an existing vma
* @ctx: pointer to the userfaultfd context
- * @mm: the address space to move pages
* @dst_start: start of the destination virtual memory range
* @src_start: start of the source virtual memory range
* @len: length of the virtual memory range
* @mode: flags from uffdio_move.mode
*
- * Must be called with mmap_lock held for read.
+ * It will either use the mmap_lock in read mode or per-vma locks.
*
* move_pages() remaps arbitrary anonymous pages atomically in zero
* copy. It only works on non shared anonymous pages because those can
@@ -1312,10 +1550,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
* could be obtained. This is the only additional complexity added to
* the rmap code to provide this anonymous page remapping functionality.
*/
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
- unsigned long dst_start, unsigned long src_start,
- unsigned long len, __u64 mode)
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len, __u64 mode)
{
+ struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
unsigned long src_addr, dst_addr;
pmd_t *src_pmd, *dst_pmd;
@@ -1333,28 +1571,34 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
WARN_ON_ONCE(dst_start + len <= dst_start))
goto out;
+ err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
+ if (err)
+ goto out;
+
+ /* Re-check after taking map_changing_lock */
+ err = -EAGAIN;
+ down_read(&ctx->map_changing_lock);
+ if (unlikely(atomic_read(&ctx->mmap_changing)))
+ goto out_unlock;
/*
* Make sure the vma is not shared, that the src and dst remap
* ranges are both valid and fully within a single existing
* vma.
*/
- src_vma = find_vma(mm, src_start);
- if (!src_vma || (src_vma->vm_flags & VM_SHARED))
- goto out;
- if (src_start < src_vma->vm_start ||
- src_start + len > src_vma->vm_end)
- goto out;
+ err = -EINVAL;
+ if (src_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
+ if (src_start + len > src_vma->vm_end)
+ goto out_unlock;
- dst_vma = find_vma(mm, dst_start);
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
- goto out;
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
- goto out;
+ if (dst_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
+ if (dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
err = validate_move_areas(ctx, src_vma, dst_vma);
if (err)
- goto out;
+ goto out_unlock;
for (src_addr = src_start, dst_addr = dst_start;
src_addr < src_start + len;) {
@@ -1404,19 +1648,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
err = -ENOENT;
break;
}
- /* Avoid moving zeropages for now */
- if (is_huge_zero_pmd(*src_pmd)) {
- spin_unlock(ptl);
- err = -EBUSY;
- break;
- }
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
- if (!folio || !PageAnonExclusive(&folio->page)) {
+ if (!folio || (!is_huge_zero_page(&folio->page) &&
+ !PageAnonExclusive(&folio->page))) {
spin_unlock(ptl);
err = -EBUSY;
break;
@@ -1476,6 +1715,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
moved += step_size;
}
+out_unlock:
+ up_read(&ctx->map_changing_lock);
+ uffd_move_unlock(dst_vma, src_vma);
out:
VM_WARN_ON(moved < 0);
VM_WARN_ON(err > 0);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f9c854ce6cc..198d623054c5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1744,17 +1744,17 @@ bool folio_isolate_lru(struct folio *folio)
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
-static int too_many_isolated(struct pglist_data *pgdat, int file,
+static bool too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
bool too_many;
if (current_is_kswapd())
- return 0;
+ return false;
if (!writeback_throttling_sane(sc))
- return 0;
+ return false;
if (file) {
inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
@@ -1998,7 +1998,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_inactive);
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
@@ -2412,7 +2412,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
unsigned long lruvec_size;
unsigned long low, min;
unsigned long scan;
@@ -2879,38 +2879,37 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
#endif
-static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
+static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last)
{
int i;
int hist;
+ struct lruvec *lruvec = walk->lruvec;
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
- if (walk) {
- hist = lru_hist_from_seq(walk->max_seq);
+ hist = lru_hist_from_seq(walk->seq);
- for (i = 0; i < NR_MM_STATS; i++) {
- WRITE_ONCE(mm_state->stats[hist][i],
- mm_state->stats[hist][i] + walk->mm_stats[i]);
- walk->mm_stats[i] = 0;
- }
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(mm_state->stats[hist][i],
+ mm_state->stats[hist][i] + walk->mm_stats[i]);
+ walk->mm_stats[i] = 0;
}
if (NR_HIST_GENS > 1 && last) {
- hist = lru_hist_from_seq(mm_state->seq + 1);
+ hist = lru_hist_from_seq(walk->seq + 1);
for (i = 0; i < NR_MM_STATS; i++)
WRITE_ONCE(mm_state->stats[hist][i], 0);
}
}
-static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
- struct mm_struct **iter)
+static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter)
{
bool first = false;
bool last = false;
struct mm_struct *mm = NULL;
+ struct lruvec *lruvec = walk->lruvec;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
@@ -2927,9 +2926,9 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
*/
spin_lock(&mm_list->lock);
- VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq);
- if (walk->max_seq <= mm_state->seq)
+ if (walk->seq <= mm_state->seq)
goto done;
if (!mm_state->head)
@@ -2954,12 +2953,12 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
} while (!(mm = get_next_mm(walk)));
done:
if (*iter || last)
- reset_mm_stats(lruvec, walk, last);
+ reset_mm_stats(walk, last);
spin_unlock(&mm_list->lock);
if (mm && first)
- reset_bloom_filter(mm_state, walk->max_seq + 1);
+ reset_bloom_filter(mm_state, walk->seq + 1);
if (*iter)
mmput_async(*iter);
@@ -2969,7 +2968,7 @@ done:
return last;
}
-static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq)
{
bool success = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -2978,13 +2977,12 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
spin_lock(&mm_list->lock);
- VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < seq);
- if (max_seq > mm_state->seq) {
+ if (seq > mm_state->seq) {
mm_state->head = NULL;
mm_state->tail = NULL;
WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
- reset_mm_stats(lruvec, NULL, true);
success = true;
}
@@ -3159,9 +3157,10 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
walk->nr_pages[new_gen][type][zone] += delta;
}
-static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+static void reset_batch_size(struct lru_gen_mm_walk *walk)
{
int gen, type, zone;
+ struct lruvec *lruvec = walk->lruvec;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
walk->batched = 0;
@@ -3331,7 +3330,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+ DEFINE_MAX_SEQ(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
if (!pte)
@@ -3398,7 +3398,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+ DEFINE_MAX_SEQ(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3529,7 +3530,7 @@ restart:
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
}
- if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i))
+ if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i))
continue;
walk->mm_stats[MM_NONLEAF_FOUND]++;
@@ -3540,7 +3541,7 @@ restart:
walk->mm_stats[MM_NONLEAF_ADDED]++;
/* carry over to the next generation */
- update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i);
+ update_bloom_filter(mm_state, walk->seq + 1, pmd + i);
}
walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
@@ -3591,7 +3592,7 @@ done:
return -EAGAIN;
}
-static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
{
static const struct mm_walk_ops mm_walk_ops = {
.test_walk = should_skip_vma,
@@ -3600,6 +3601,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
};
int err;
+ struct lruvec *lruvec = walk->lruvec;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
walk->next_addr = FIRST_USER_ADDRESS;
@@ -3610,7 +3612,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
err = -EBUSY;
/* another thread might have called inc_max_seq() */
- if (walk->max_seq != max_seq)
+ if (walk->seq != max_seq)
break;
/* folio_update_gen() requires stable folio_memcg() */
@@ -3628,7 +3630,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
if (walk->batched) {
spin_lock_irq(&lruvec->lru_lock);
- reset_batch_size(lruvec, walk);
+ reset_batch_size(walk);
spin_unlock_irq(&lruvec->lru_lock);
}
@@ -3747,7 +3749,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
bool can_swap, bool force_scan)
{
bool success;
@@ -3755,14 +3757,14 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
int type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
restart:
- if (max_seq < READ_ONCE(lrugen->max_seq))
+ if (seq < READ_ONCE(lrugen->max_seq))
return false;
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
- success = max_seq == lrugen->max_seq;
+ success = seq == lrugen->max_seq;
if (!success)
goto unlock;
@@ -3815,8 +3817,8 @@ unlock:
return success;
}
-static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, bool force_scan)
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
+ bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3824,13 +3826,13 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
- VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+ VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, can_swap, force_scan);
/* see the comment in iterate_mm_list() */
- if (max_seq <= READ_ONCE(mm_state->seq))
+ if (seq <= READ_ONCE(mm_state->seq))
return false;
/*
@@ -3840,29 +3842,29 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* is less efficient, but it avoids bursty page faults.
*/
if (!should_walk_mmu()) {
- success = iterate_mm_list_nowalk(lruvec, max_seq);
+ success = iterate_mm_list_nowalk(lruvec, seq);
goto done;
}
walk = set_mm_walk(NULL, true);
if (!walk) {
- success = iterate_mm_list_nowalk(lruvec, max_seq);
+ success = iterate_mm_list_nowalk(lruvec, seq);
goto done;
}
walk->lruvec = lruvec;
- walk->max_seq = max_seq;
+ walk->seq = seq;
walk->can_swap = can_swap;
walk->force_scan = force_scan;
do {
- success = iterate_mm_list(lruvec, walk, &mm);
+ success = iterate_mm_list(walk, &mm);
if (mm)
- walk_mm(lruvec, mm, walk);
+ walk_mm(mm, walk);
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, can_swap, force_scan);
WARN_ON_ONCE(!success);
}
@@ -4287,7 +4289,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
{
bool success;
- /* swapping inhibited */
+ /* swap constrained */
if (!(sc->gfp_mask & __GFP_IO) &&
(folio_test_dirty(folio) ||
(folio_test_anon(folio) && !folio_test_swapcache(folio))))
@@ -4456,9 +4458,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
DEFINE_MIN_SEQ(lruvec);
/*
- * Try to make the obvious choice first. When anon and file are both
- * available from the same generation, interpret swappiness 1 as file
- * first and 200 as anon first.
+ * Try to make the obvious choice first, and if anon and file are both
+ * available from the same generation,
+ * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
+ * first.
+ * 2. If !__GFP_IO, file first since clean pagecache is more likely to
+ * exist than clean swapcache.
*/
if (!swappiness)
type = LRU_GEN_FILE;
@@ -4468,6 +4473,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
type = LRU_GEN_FILE;
else if (swappiness == 200)
type = LRU_GEN_ANON;
+ else if (!(sc->gfp_mask & __GFP_IO))
+ type = LRU_GEN_FILE;
else
type = get_type_to_scan(lruvec, swappiness, &tier);
@@ -4558,8 +4565,10 @@ retry:
move_folios_to_lru(lruvec, &list);
walk = current->reclaim_state->mm_walk;
- if (walk && walk->batched)
- reset_batch_size(lruvec, walk);
+ if (walk && walk->batched) {
+ walk->lruvec = lruvec;
+ reset_batch_size(walk);
+ }
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
@@ -4584,14 +4593,13 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+ bool can_swap, unsigned long *nr_to_scan)
{
int gen, type, zone;
unsigned long old = 0;
unsigned long young = 0;
unsigned long total = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
/* whether this lruvec is completely out of cold folios */
@@ -4619,13 +4627,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
}
}
- /* try to scrape all its memory if this memcg was deleted */
- if (!mem_cgroup_online(memcg)) {
- *nr_to_scan = total;
- return false;
- }
-
- *nr_to_scan = total >> sc->priority;
+ *nr_to_scan = total;
/*
* The aging tries to be lazy to reduce the overhead, while the eviction
@@ -4657,6 +4659,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
*/
static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
+ bool success;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4664,15 +4667,18 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
- return nr_to_scan;
+ success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
- /* skip the aging path at the default priority */
- if (sc->priority == DEF_PRIORITY)
+ /* try to scrape all its memory if this memcg was deleted */
+ if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
- /* skip this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+ /* try to get away with not aging at the default priority */
+ if (!success || sc->priority == DEF_PRIORITY)
+ return nr_to_scan >> sc->priority;
+
+ /* stop scanning this lruvec as it's low on cold folios */
+ return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -4712,10 +4718,6 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long scanned = 0;
int swappiness = get_swappiness(lruvec, sc);
- /* clean file folios are more likely to exist */
- if (swappiness && !(sc->gfp_mask & __GFP_IO))
- swappiness = 1;
-
while (true) {
int delta;
@@ -4878,7 +4880,6 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
{
int priority;
unsigned long reclaimable;
- struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
return;
@@ -4888,7 +4889,7 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
* where reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_swappiness(lruvec, sc))
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
/* round down reclaimable and round up sc->nr_to_reclaim */
@@ -5332,7 +5333,7 @@ static const struct seq_operations lru_gen_seq_ops = {
.show = lru_gen_seq_show,
};
-static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+static int run_aging(struct lruvec *lruvec, unsigned long seq,
bool can_swap, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
@@ -5347,7 +5348,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr
if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
return -ERANGE;
- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+ try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
return 0;
}
@@ -5415,7 +5416,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
switch (cmd) {
case '+':
- err = run_aging(lruvec, seq, sc, swappiness, opt);
+ err = run_aging(lruvec, seq, swappiness, opt);
break;
case '-':
err = run_eviction(lruvec, seq, sc, swappiness, opt);
@@ -6796,6 +6797,7 @@ restart:
bool raise_priority = true;
bool balanced;
bool ret;
+ bool was_frozen;
sc.reclaim_idx = highest_zoneidx;
@@ -6894,9 +6896,9 @@ restart:
/* Check if kswapd should be suspending */
__fs_reclaim_release(_THIS_IP_);
- ret = try_to_freeze();
+ ret = kthread_freezable_should_stop(&was_frozen);
__fs_reclaim_acquire(_THIS_IP_);
- if (ret || kthread_should_stop())
+ if (was_frozen || ret)
break;
/*
@@ -7102,7 +7104,7 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
- bool ret;
+ bool was_frozen;
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
@@ -7119,15 +7121,14 @@ kswapd_try_sleep:
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
- ret = try_to_freeze();
- if (kthread_should_stop())
+ if (kthread_freezable_should_stop(&was_frozen))
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (ret)
+ if (was_frozen)
continue;
/*
diff --git a/mm/zswap.c b/mm/zswap.c
index db4625af65fb..62fe307521c9 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -71,8 +71,6 @@ static u64 zswap_reject_compress_poor;
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
-/* Duplicate store was encountered (rare) */
-static u64 zswap_duplicate_entry;
/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
@@ -141,10 +139,6 @@ static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
bool, 0644);
-static bool zswap_exclusive_loads_enabled = IS_ENABLED(
- CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
-module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
-
/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32
@@ -199,12 +193,6 @@ struct zswap_pool {
*
* rbnode - links the entry into red-black tree for the appropriate swap type
* swpentry - associated swap entry, the offset indexes into the red-black tree
- * refcount - the number of outstanding reference to the entry. This is needed
- * to protect against premature freeing of the entry by code
- * concurrent calls to load, invalidate, and writeback. The lock
- * for the zswap_tree structure that contains the entry must
- * be held while changing the refcount. Since the lock must
- * be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
* decompression. For a same value filled page length is 0, and both
* pool and lru are invalid and must be ignored.
@@ -217,7 +205,6 @@ struct zswap_pool {
struct zswap_entry {
struct rb_node rbnode;
swp_entry_t swpentry;
- int refcount;
unsigned int length;
struct zswap_pool *pool;
union {
@@ -228,17 +215,13 @@ struct zswap_entry {
struct list_head lru;
};
-/*
- * The tree lock in the zswap_tree struct protects a few things:
- * - the rbtree
- * - the refcount field of each entry in the tree
- */
struct zswap_tree {
struct rb_root rbroot;
spinlock_t lock;
};
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+static unsigned int nr_zswap_trees[MAX_SWAPFILES];
/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
@@ -265,15 +248,16 @@ static bool zswap_has_pool;
* helpers and fwd declarations
**********************************/
+static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
+{
+ return &zswap_trees[swp_type(swp)][swp_offset(swp)
+ >> SWAP_ADDRESS_SPACE_SHIFT];
+}
+
#define zswap_pool_debug(msg, p) \
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0]))
-static int zswap_writeback_entry(struct zswap_entry *entry,
- struct zswap_tree *tree);
-static int zswap_pool_get(struct zswap_pool *pool);
-static void zswap_pool_put(struct zswap_pool *pool);
-
static bool zswap_is_full(void)
{
return totalram_pages() * zswap_max_pool_percent / 100 <
@@ -313,702 +297,12 @@ static void zswap_update_total_size(void)
zswap_pool_total_size = total;
}
-/* should be called under RCU */
-#ifdef CONFIG_MEMCG
-static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
-{
- return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
-}
-#else
-static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
-{
- return NULL;
-}
-#endif
-
-static inline int entry_to_nid(struct zswap_entry *entry)
-{
- return page_to_nid(virt_to_page(entry));
-}
-
-void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
-{
- struct zswap_pool *pool;
-
- /* lock out zswap pools list modification */
- spin_lock(&zswap_pools_lock);
- list_for_each_entry(pool, &zswap_pools, list) {
- if (pool->next_shrink == memcg)
- pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
- }
- spin_unlock(&zswap_pools_lock);
-}
-
-/*********************************
-* zswap entry functions
-**********************************/
-static struct kmem_cache *zswap_entry_cache;
-
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
-{
- struct zswap_entry *entry;
- entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
- if (!entry)
- return NULL;
- entry->refcount = 1;
- RB_CLEAR_NODE(&entry->rbnode);
- return entry;
-}
-
-static void zswap_entry_cache_free(struct zswap_entry *entry)
-{
- kmem_cache_free(zswap_entry_cache, entry);
-}
-
-/*********************************
-* zswap lruvec functions
-**********************************/
-void zswap_lruvec_state_init(struct lruvec *lruvec)
-{
- atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
-}
-
-void zswap_folio_swapin(struct folio *folio)
-{
- struct lruvec *lruvec;
-
- VM_WARN_ON_ONCE(!folio_test_locked(folio));
- lruvec = folio_lruvec(folio);
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-}
-
-/*********************************
-* lru functions
-**********************************/
-static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
-{
- atomic_long_t *nr_zswap_protected;
- unsigned long lru_size, old, new;
- int nid = entry_to_nid(entry);
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- /*
- * Note that it is safe to use rcu_read_lock() here, even in the face of
- * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
- * used in list_lru lookup, only two scenarios are possible:
- *
- * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
- * new entry will be reparented to memcg's parent's list_lru.
- * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
- * new entry will be added directly to memcg's parent's list_lru.
- *
- * Similar reasoning holds for list_lru_del() and list_lru_putback().
- */
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- /* will always succeed */
- list_lru_add(list_lru, &entry->lru, nid, memcg);
-
- /* Update the protection area */
- lru_size = list_lru_count_one(list_lru, nid, memcg);
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
- old = atomic_long_inc_return(nr_zswap_protected);
- /*
- * Decay to avoid overflow and adapt to changing workloads.
- * This is based on LRU reclaim cost decaying heuristics.
- */
- do {
- new = old > lru_size / 4 ? old / 2 : old;
- } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
- rcu_read_unlock();
-}
-
-static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
-{
- int nid = entry_to_nid(entry);
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- /* will always succeed */
- list_lru_del(list_lru, &entry->lru, nid, memcg);
- rcu_read_unlock();
-}
-
-static void zswap_lru_putback(struct list_lru *list_lru,
- struct zswap_entry *entry)
-{
- int nid = entry_to_nid(entry);
- spinlock_t *lock = &list_lru->node[nid].lock;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- spin_lock(lock);
- /* we cannot use list_lru_add here, because it increments node's lru count */
- list_lru_putback(list_lru, &entry->lru, nid, memcg);
- spin_unlock(lock);
-
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
- /* increment the protection area to account for the LRU rotation. */
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- rcu_read_unlock();
-}
-
-/*********************************
-* rbtree functions
-**********************************/
-static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
-{
- struct rb_node *node = root->rb_node;
- struct zswap_entry *entry;
- pgoff_t entry_offset;
-
- while (node) {
- entry = rb_entry(node, struct zswap_entry, rbnode);
- entry_offset = swp_offset(entry->swpentry);
- if (entry_offset > offset)
- node = node->rb_left;
- else if (entry_offset < offset)
- node = node->rb_right;
- else
- return entry;
- }
- return NULL;
-}
-
-/*
- * In the case that a entry with the same offset is found, a pointer to
- * the existing entry is stored in dupentry and the function returns -EEXIST
- */
-static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
- struct zswap_entry **dupentry)
-{
- struct rb_node **link = &root->rb_node, *parent = NULL;
- struct zswap_entry *myentry;
- pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
-
- while (*link) {
- parent = *link;
- myentry = rb_entry(parent, struct zswap_entry, rbnode);
- myentry_offset = swp_offset(myentry->swpentry);
- if (myentry_offset > entry_offset)
- link = &(*link)->rb_left;
- else if (myentry_offset < entry_offset)
- link = &(*link)->rb_right;
- else {
- *dupentry = myentry;
- return -EEXIST;
- }
- }
- rb_link_node(&entry->rbnode, parent, link);
- rb_insert_color(&entry->rbnode, root);
- return 0;
-}
-
-static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
-{
- if (!RB_EMPTY_NODE(&entry->rbnode)) {
- rb_erase(&entry->rbnode, root);
- RB_CLEAR_NODE(&entry->rbnode);
- return true;
- }
- return false;
-}
-
-static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
-{
- int i = 0;
-
- if (ZSWAP_NR_ZPOOLS > 1)
- i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
-
- return entry->pool->zpools[i];
-}
-
-/*
- * Carries out the common pattern of freeing and entry's zpool allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
-static void zswap_free_entry(struct zswap_entry *entry)
-{
- if (!entry->length)
- atomic_dec(&zswap_same_filled_pages);
- else {
- zswap_lru_del(&entry->pool->list_lru, entry);
- zpool_free(zswap_find_zpool(entry), entry->handle);
- atomic_dec(&entry->pool->nr_stored);
- zswap_pool_put(entry->pool);
- }
- if (entry->objcg) {
- obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
- obj_cgroup_put(entry->objcg);
- }
- zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
- zswap_update_total_size();
-}
-
-/* caller must hold the tree lock */
-static void zswap_entry_get(struct zswap_entry *entry)
-{
- entry->refcount++;
-}
-
-/* caller must hold the tree lock
-* remove from the tree and free it, if nobody reference the entry
-*/
-static void zswap_entry_put(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- int refcount = --entry->refcount;
-
- WARN_ON_ONCE(refcount < 0);
- if (refcount == 0) {
- WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
- zswap_free_entry(entry);
- }
-}
-
-/* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
- pgoff_t offset)
-{
- struct zswap_entry *entry;
-
- entry = zswap_rb_search(root, offset);
- if (entry)
- zswap_entry_get(entry);
-
- return entry;
-}
-
-/*********************************
-* shrinker functions
-**********************************/
-static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
- spinlock_t *lock, void *arg);
-
-static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
- unsigned long shrink_ret, nr_protected, lru_size;
- struct zswap_pool *pool = shrinker->private_data;
- bool encountered_page_in_swapcache = false;
-
- if (!zswap_shrinker_enabled ||
- !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- lru_size = list_lru_shrink_count(&pool->list_lru, sc);
-
- /*
- * Abort if we are shrinking into the protected region.
- *
- * This short-circuiting is necessary because if we have too many multiple
- * concurrent reclaimers getting the freeable zswap object counts at the
- * same time (before any of them made reasonable progress), the total
- * number of reclaimed objects might be more than the number of unprotected
- * objects (i.e the reclaimers will reclaim into the protected area of the
- * zswap LRU).
- */
- if (nr_protected >= lru_size - sc->nr_to_scan) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
- shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
- &encountered_page_in_swapcache);
-
- if (encountered_page_in_swapcache)
- return SHRINK_STOP;
-
- return shrink_ret ? shrink_ret : SHRINK_STOP;
-}
-
-static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- struct zswap_pool *pool = shrinker->private_data;
- struct mem_cgroup *memcg = sc->memcg;
- struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
- unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
-
- if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
- return 0;
-
-#ifdef CONFIG_MEMCG_KMEM
- mem_cgroup_flush_stats(memcg);
- nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
- nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
-#else
- /* use pool stats instead of memcg stats */
- nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
- nr_stored = atomic_read(&pool->nr_stored);
-#endif
-
- if (!nr_stored)
- return 0;
-
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
- /*
- * Subtract the lru size by an estimate of the number of pages
- * that should be protected.
- */
- nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
-
- /*
- * Scale the number of freeable pages by the memory saving factor.
- * This ensures that the better zswap compresses memory, the fewer
- * pages we will evict to swap (as it will otherwise incur IO for
- * relatively small memory saving).
- */
- return mult_frac(nr_freeable, nr_backing, nr_stored);
-}
-
-static void zswap_alloc_shrinker(struct zswap_pool *pool)
-{
- pool->shrinker =
- shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
- if (!pool->shrinker)
- return;
-
- pool->shrinker->private_data = pool;
- pool->shrinker->scan_objects = zswap_shrinker_scan;
- pool->shrinker->count_objects = zswap_shrinker_count;
- pool->shrinker->batch = 0;
- pool->shrinker->seeks = DEFAULT_SEEKS;
-}
-
-/*********************************
-* per-cpu code
-**********************************/
-static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp;
- struct acomp_req *req;
- int ret;
-
- mutex_init(&acomp_ctx->mutex);
-
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
-
- acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
- if (IS_ERR(acomp)) {
- pr_err("could not alloc crypto acomp %s : %ld\n",
- pool->tfm_name, PTR_ERR(acomp));
- ret = PTR_ERR(acomp);
- goto acomp_fail;
- }
- acomp_ctx->acomp = acomp;
-
- req = acomp_request_alloc(acomp_ctx->acomp);
- if (!req) {
- pr_err("could not alloc crypto acomp_request %s\n",
- pool->tfm_name);
- ret = -ENOMEM;
- goto req_fail;
- }
- acomp_ctx->req = req;
-
- crypto_init_wait(&acomp_ctx->wait);
- /*
- * if the backend of acomp is async zip, crypto_req_done() will wakeup
- * crypto_wait_req(); if the backend of acomp is scomp, the callback
- * won't be called, crypto_wait_req() will return without blocking.
- */
- acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &acomp_ctx->wait);
-
- return 0;
-
-req_fail:
- crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
- kfree(acomp_ctx->buffer);
- return ret;
-}
-
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
-
- return 0;
-}
-
/*********************************
* pool functions
**********************************/
-static struct zswap_pool *__zswap_pool_current(void)
-{
- struct zswap_pool *pool;
-
- pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
- WARN_ONCE(!pool && zswap_has_pool,
- "%s: no page storage pool!\n", __func__);
-
- return pool;
-}
-
-static struct zswap_pool *zswap_pool_current(void)
-{
- assert_spin_locked(&zswap_pools_lock);
-
- return __zswap_pool_current();
-}
-
-static struct zswap_pool *zswap_pool_current_get(void)
-{
- struct zswap_pool *pool;
-
- rcu_read_lock();
-
- pool = __zswap_pool_current();
- if (!zswap_pool_get(pool))
- pool = NULL;
-
- rcu_read_unlock();
-
- return pool;
-}
-
-static struct zswap_pool *zswap_pool_last_get(void)
-{
- struct zswap_pool *pool, *last = NULL;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pool, &zswap_pools, list)
- last = pool;
- WARN_ONCE(!last && zswap_has_pool,
- "%s: no page storage pool!\n", __func__);
- if (!zswap_pool_get(last))
- last = NULL;
-
- rcu_read_unlock();
-
- return last;
-}
-
-/* type and compressor must be null-terminated */
-static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
-{
- struct zswap_pool *pool;
-
- assert_spin_locked(&zswap_pools_lock);
-
- list_for_each_entry_rcu(pool, &zswap_pools, list) {
- if (strcmp(pool->tfm_name, compressor))
- continue;
- /* all zpools share the same type */
- if (strcmp(zpool_get_type(pool->zpools[0]), type))
- continue;
- /* if we can't get it, it's about to be destroyed */
- if (!zswap_pool_get(pool))
- continue;
- return pool;
- }
-
- return NULL;
-}
-
-/*
- * If the entry is still valid in the tree, drop the initial ref and remove it
- * from the tree. This function must be called with an additional ref held,
- * otherwise it may race with another invalidation freeing the entry.
- */
-static void zswap_invalidate_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- if (zswap_rb_erase(&tree->rbroot, entry))
- zswap_entry_put(tree, entry);
-}
-
-static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
- spinlock_t *lock, void *arg)
-{
- struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
- bool *encountered_page_in_swapcache = (bool *)arg;
- struct zswap_tree *tree;
- pgoff_t swpoffset;
- enum lru_status ret = LRU_REMOVED_RETRY;
- int writeback_result;
-
- /*
- * Once the lru lock is dropped, the entry might get freed. The
- * swpoffset is copied to the stack, and entry isn't deref'd again
- * until the entry is verified to still be alive in the tree.
- */
- swpoffset = swp_offset(entry->swpentry);
- tree = zswap_trees[swp_type(entry->swpentry)];
- list_lru_isolate(l, item);
- /*
- * It's safe to drop the lock here because we return either
- * LRU_REMOVED_RETRY or LRU_RETRY.
- */
- spin_unlock(lock);
-
- /* Check for invalidate() race */
- spin_lock(&tree->lock);
- if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
- goto unlock;
-
- /* Hold a reference to prevent a free during writeback */
- zswap_entry_get(entry);
- spin_unlock(&tree->lock);
-
- writeback_result = zswap_writeback_entry(entry, tree);
-
- spin_lock(&tree->lock);
- if (writeback_result) {
- zswap_reject_reclaim_fail++;
- zswap_lru_putback(&entry->pool->list_lru, entry);
- ret = LRU_RETRY;
-
- /*
- * Encountering a page already in swap cache is a sign that we are shrinking
- * into the warmer region. We should terminate shrinking (if we're in the dynamic
- * shrinker context).
- */
- if (writeback_result == -EEXIST && encountered_page_in_swapcache)
- *encountered_page_in_swapcache = true;
-
- goto put_unlock;
- }
- zswap_written_back_pages++;
-
- if (entry->objcg)
- count_objcg_event(entry->objcg, ZSWPWB);
-
- count_vm_event(ZSWPWB);
- /*
- * Writeback started successfully, the page now belongs to the
- * swapcache. Drop the entry from zswap - unless invalidate already
- * took it out while we had the tree->lock released for IO.
- */
- zswap_invalidate_entry(tree, entry);
-
-put_unlock:
- /* Drop local reference */
- zswap_entry_put(tree, entry);
-unlock:
- spin_unlock(&tree->lock);
- spin_lock(lock);
- return ret;
-}
-
-static int shrink_memcg(struct mem_cgroup *memcg)
-{
- struct zswap_pool *pool;
- int nid, shrunk = 0;
-
- if (!mem_cgroup_zswap_writeback_enabled(memcg))
- return -EINVAL;
-
- /*
- * Skip zombies because their LRUs are reparented and we would be
- * reclaiming from the parent instead of the dead memcg.
- */
- if (memcg && !mem_cgroup_online(memcg))
- return -ENOENT;
-
- pool = zswap_pool_current_get();
- if (!pool)
- return -EINVAL;
-
- for_each_node_state(nid, N_NORMAL_MEMORY) {
- unsigned long nr_to_walk = 1;
-
- shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
- &shrink_memcg_cb, NULL, &nr_to_walk);
- }
- zswap_pool_put(pool);
- return shrunk ? 0 : -EAGAIN;
-}
-
-static void shrink_worker(struct work_struct *w)
-{
- struct zswap_pool *pool = container_of(w, typeof(*pool),
- shrink_work);
- struct mem_cgroup *memcg;
- int ret, failures = 0;
-
- /* global reclaim will select cgroup in a round-robin fashion. */
- do {
- spin_lock(&zswap_pools_lock);
- pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
- memcg = pool->next_shrink;
-
- /*
- * We need to retry if we have gone through a full round trip, or if we
- * got an offline memcg (or else we risk undoing the effect of the
- * zswap memcg offlining cleanup callback). This is not catastrophic
- * per se, but it will keep the now offlined memcg hostage for a while.
- *
- * Note that if we got an online memcg, we will keep the extra
- * reference in case the original reference obtained by mem_cgroup_iter
- * is dropped by the zswap memcg offlining callback, ensuring that the
- * memcg is not killed when we are reclaiming.
- */
- if (!memcg) {
- spin_unlock(&zswap_pools_lock);
- if (++failures == MAX_RECLAIM_RETRIES)
- break;
-
- goto resched;
- }
-
- if (!mem_cgroup_tryget_online(memcg)) {
- /* drop the reference from mem_cgroup_iter() */
- mem_cgroup_iter_break(NULL, memcg);
- pool->next_shrink = NULL;
- spin_unlock(&zswap_pools_lock);
-
- if (++failures == MAX_RECLAIM_RETRIES)
- break;
-
- goto resched;
- }
- spin_unlock(&zswap_pools_lock);
-
- ret = shrink_memcg(memcg);
- /* drop the extra reference */
- mem_cgroup_put(memcg);
-
- if (ret == -EINVAL)
- break;
- if (ret && ++failures == MAX_RECLAIM_RETRIES)
- break;
-
-resched:
- cond_resched();
- } while (!zswap_can_accept());
- zswap_pool_put(pool);
-}
+static void zswap_alloc_shrinker(struct zswap_pool *pool);
+static void shrink_worker(struct work_struct *w);
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
@@ -1155,14 +449,6 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
kfree(pool);
}
-static int __must_check zswap_pool_get(struct zswap_pool *pool)
-{
- if (!pool)
- return 0;
-
- return kref_get_unless_zero(&pool->kref);
-}
-
static void __zswap_pool_release(struct work_struct *work)
{
struct zswap_pool *pool = container_of(work, typeof(*pool),
@@ -1177,6 +463,8 @@ static void __zswap_pool_release(struct work_struct *work)
zswap_pool_destroy(pool);
}
+static struct zswap_pool *zswap_pool_current(void);
+
static void __zswap_pool_empty(struct kref *kref)
{
struct zswap_pool *pool;
@@ -1195,11 +483,92 @@ static void __zswap_pool_empty(struct kref *kref)
spin_unlock(&zswap_pools_lock);
}
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+ if (!pool)
+ return 0;
+
+ return kref_get_unless_zero(&pool->kref);
+}
+
static void zswap_pool_put(struct zswap_pool *pool)
{
kref_put(&pool->kref, __zswap_pool_empty);
}
+static struct zswap_pool *__zswap_pool_current(void)
+{
+ struct zswap_pool *pool;
+
+ pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+ WARN_ONCE(!pool && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+ assert_spin_locked(&zswap_pools_lock);
+
+ return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+ struct zswap_pool *pool;
+
+ rcu_read_lock();
+
+ pool = __zswap_pool_current();
+ if (!zswap_pool_get(pool))
+ pool = NULL;
+
+ rcu_read_unlock();
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_last_get(void)
+{
+ struct zswap_pool *pool, *last = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list)
+ last = pool;
+ WARN_ONCE(!last && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
+ if (!zswap_pool_get(last))
+ last = NULL;
+
+ rcu_read_unlock();
+
+ return last;
+}
+
+/* type and compressor must be null-terminated */
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+ struct zswap_pool *pool;
+
+ assert_spin_locked(&zswap_pools_lock);
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list) {
+ if (strcmp(pool->tfm_name, compressor))
+ continue;
+ /* all zpools share the same type */
+ if (strcmp(zpool_get_type(pool->zpools[0]), type))
+ continue;
+ /* if we can't get it, it's about to be destroyed */
+ if (!zswap_pool_get(pool))
+ continue;
+ return pool;
+ }
+
+ return NULL;
+}
+
/*********************************
* param callbacks
**********************************/
@@ -1356,7 +725,372 @@ static int zswap_enabled_param_set(const char *val,
return ret;
}
-static void __zswap_load(struct zswap_entry *entry, struct page *page)
+/*********************************
+* lru functions
+**********************************/
+
+/* should be called under RCU */
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+ return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
+}
+#else
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+ return NULL;
+}
+#endif
+
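+/* NUMA node of the page that backs this zswap_entry's allocation. */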
+static inline int entry_to_nid(struct zswap_entry *entry)
+{
+ return page_to_nid(virt_to_page(entry));
+}
+
+static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+ atomic_long_t *nr_zswap_protected;
+ unsigned long lru_size, old, new;
+ int nid = entry_to_nid(entry);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ /*
+ * Note that it is safe to use rcu_read_lock() here, even in the face of
+ * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
+ * used in list_lru lookup, only two scenarios are possible:
+ *
+ * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
+ * new entry will be reparented to memcg's parent's list_lru.
+ * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
+ * new entry will be added directly to memcg's parent's list_lru.
+ *
+ * Similar reasoning holds for list_lru_del().
+ */
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ /* will always succeed */
+ list_lru_add(list_lru, &entry->lru, nid, memcg);
+
+ /* Update the protection area */
+ lru_size = list_lru_count_one(list_lru, nid, memcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
+ old = atomic_long_inc_return(nr_zswap_protected);
+ /*
+ * Decay to avoid overflow and adapt to changing workloads.
+ * This is based on LRU reclaim cost decaying heuristics.
+ */
+ do {
+ new = old > lru_size / 4 ? old / 2 : old;
+ } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
+ rcu_read_unlock();
+}
+
+static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+ int nid = entry_to_nid(entry);
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ /* will always succeed */
+ list_lru_del(list_lru, &entry->lru, nid, memcg);
+ rcu_read_unlock();
+}
+
+void zswap_lruvec_state_init(struct lruvec *lruvec)
+{
+ atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+}
+
+void zswap_folio_swapin(struct folio *folio)
+{
+ struct lruvec *lruvec;
+
+ if (folio) {
+ lruvec = folio_lruvec(folio);
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ }
+}
+
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
+{
+ struct zswap_pool *pool;
+
+ /* lock out zswap pools list modification */
+ spin_lock(&zswap_pools_lock);
+ list_for_each_entry(pool, &zswap_pools, list) {
+ if (pool->next_shrink == memcg)
+ pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+ }
+ spin_unlock(&zswap_pools_lock);
+}
+
+/*********************************
+* rbtree functions
+**********************************/
+static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
+{
+ struct rb_node *node = root->rb_node;
+ struct zswap_entry *entry;
+ pgoff_t entry_offset;
+
+ while (node) {
+ entry = rb_entry(node, struct zswap_entry, rbnode);
+ entry_offset = swp_offset(entry->swpentry);
+ if (entry_offset > offset)
+ node = node->rb_left;
+ else if (entry_offset < offset)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * In the case that an entry with the same offset is found, a pointer to
+ * the existing entry is stored in dupentry and the function returns -EEXIST.
+ */
+static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
+ struct zswap_entry **dupentry)
+{
+ struct rb_node **link = &root->rb_node, *parent = NULL;
+ struct zswap_entry *myentry;
+ pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
+
+ while (*link) {
+ parent = *link;
+ myentry = rb_entry(parent, struct zswap_entry, rbnode);
+ myentry_offset = swp_offset(myentry->swpentry);
+ if (myentry_offset > entry_offset)
+ link = &(*link)->rb_left;
+ else if (myentry_offset < entry_offset)
+ link = &(*link)->rb_right;
+ else {
+ *dupentry = myentry;
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&entry->rbnode, parent, link);
+ rb_insert_color(&entry->rbnode, root);
+ return 0;
+}
+
+static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+{
+ rb_erase(&entry->rbnode, root);
+ RB_CLEAR_NODE(&entry->rbnode);
+}
+
+/*********************************
+* zswap entry functions
+**********************************/
+static struct kmem_cache *zswap_entry_cache;
+
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
+{
+ struct zswap_entry *entry;
+ entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
+ if (!entry)
+ return NULL;
+ RB_CLEAR_NODE(&entry->rbnode);
+ return entry;
+}
+
+static void zswap_entry_cache_free(struct zswap_entry *entry)
+{
+ kmem_cache_free(zswap_entry_cache, entry);
+}
+
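+/* Pick this entry's backing zpool by hashing the entry pointer. */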
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+ int i = 0;
+
+ if (ZSWAP_NR_ZPOOLS > 1)
+ i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
+
+ return entry->pool->zpools[i];
+}
+
+/*
+ * Carries out the common pattern of freeing an entry's zpool allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_entry_free(struct zswap_entry *entry)
+{
+ if (!entry->length)
+ atomic_dec(&zswap_same_filled_pages);
+ else {
+ zswap_lru_del(&entry->pool->list_lru, entry);
+ zpool_free(zswap_find_zpool(entry), entry->handle);
+ atomic_dec(&entry->pool->nr_stored);
+ zswap_pool_put(entry->pool);
+ }
+ if (entry->objcg) {
+ obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
+ obj_cgroup_put(entry->objcg);
+ }
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ zswap_update_total_size();
+}
+
+/*
+ * The caller holds the tree lock and has looked the entry up in the tree,
+ * so the entry must be in the tree; remove it from the tree and free it.
+ */
+static void zswap_invalidate_entry(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ zswap_rb_erase(&tree->rbroot, entry);
+ zswap_entry_free(entry);
+}
+
+/*********************************
+* compressed storage functions
+**********************************/
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct crypto_acomp *acomp;
+ struct acomp_req *req;
+ int ret;
+
+ mutex_init(&acomp_ctx->mutex);
+
+ acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffer)
+ return -ENOMEM;
+
+ acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp)) {
+ pr_err("could not alloc crypto acomp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(acomp));
+ ret = PTR_ERR(acomp);
+ goto acomp_fail;
+ }
+ acomp_ctx->acomp = acomp;
+
+ req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!req) {
+ pr_err("could not alloc crypto acomp_request %s\n",
+ pool->tfm_name);
+ ret = -ENOMEM;
+ goto req_fail;
+ }
+ acomp_ctx->req = req;
+
+ crypto_init_wait(&acomp_ctx->wait);
+ /*
+	 * If the acomp backend is an async zip driver, crypto_req_done() will
+	 * wake up crypto_wait_req(); if the backend is scomp, the callback won't
+	 * be called and crypto_wait_req() will return without blocking.
+ */
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &acomp_ctx->wait);
+
+ return 0;
+
+req_fail:
+ crypto_free_acomp(acomp_ctx->acomp);
+acomp_fail:
+ kfree(acomp_ctx->buffer);
+ return ret;
+}
+
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+ if (!IS_ERR_OR_NULL(acomp_ctx)) {
+ if (!IS_ERR_OR_NULL(acomp_ctx->req))
+ acomp_request_free(acomp_ctx->req);
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+ kfree(acomp_ctx->buffer);
+ }
+
+ return 0;
+}
+
+static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+ struct scatterlist input, output;
+ unsigned int dlen = PAGE_SIZE;
+ unsigned long handle;
+ struct zpool *zpool;
+ char *buf;
+ gfp_t gfp;
+ int ret;
+ u8 *dst;
+
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
+ mutex_lock(&acomp_ctx->mutex);
+
+ dst = acomp_ctx->buffer;
+ sg_init_table(&input, 1);
+ sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
+
+ /*
+	 * We need PAGE_SIZE * 2 here since there may be an over-compression case,
+	 * and hardware accelerators may not check the dst buffer size, so give
+	 * the dst buffer enough length to avoid a buffer overflow.
+ */
+ sg_init_one(&output, dst, PAGE_SIZE * 2);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+
+ /*
+	 * It may look a little silly that we send an asynchronous request and
+	 * then wait for its completion synchronously, which makes the process
+	 * effectively synchronous.
+	 * Theoretically, acomp lets users submit multiple requests to one acomp
+	 * instance and have them completed concurrently, but zswap stores and
+	 * loads page by page, so a single thread doing zswap has no way to send
+	 * a second page before the first one is done.
+	 * However, threads running on different CPUs use different acomp
+	 * instances, so multiple threads can do (de)compression in parallel.
+ */
+ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+ if (ret) {
+ zswap_reject_compress_fail++;
+ goto unlock;
+ }
+
+ zpool = zswap_find_zpool(entry);
+ gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ if (zpool_malloc_support_movable(zpool))
+ gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ ret = zpool_malloc(zpool, dlen, gfp, &handle);
+ if (ret == -ENOSPC) {
+ zswap_reject_compress_poor++;
+ goto unlock;
+ }
+ if (ret) {
+ zswap_reject_alloc_fail++;
+ goto unlock;
+ }
+
+ buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
+ memcpy(buf, dst, dlen);
+ zpool_unmap_handle(zpool, handle);
+
+ entry->handle = handle;
+ entry->length = dlen;
+
+unlock:
+ mutex_unlock(&acomp_ctx->mutex);
+ return ret == 0;
+}
+
+static void zswap_decompress(struct zswap_entry *entry, struct page *page)
{
struct zpool *zpool = zswap_find_zpool(entry);
struct scatterlist input, output;
@@ -1401,9 +1135,9 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
* freed.
*/
static int zswap_writeback_entry(struct zswap_entry *entry,
- struct zswap_tree *tree)
+ swp_entry_t swpentry)
{
- swp_entry_t swpentry = entry->swpentry;
+ struct zswap_tree *tree;
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
@@ -1419,9 +1153,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
return -ENOMEM;
/*
- * Found an existing folio, we raced with load/swapin. We generally
- * writeback cold folios from zswap, and swapin means the folio just
- * became hot. Skip this folio and let the caller find another one.
+	 * Found an existing folio, so we raced with swapin or a concurrent
+	 * shrinker. We generally write back cold folios from zswap, and
+	 * swapin means the folio just became hot, so skip this folio.
+	 * In the unlikely concurrent shrinker case, the entry will be unlinked
+	 * and freed when invalidated by the concurrent shrinker anyway.
*/
if (!folio_was_allocated) {
folio_put(folio);
@@ -1430,22 +1166,34 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/*
* folio is locked, and the swapcache is now secured against
- * concurrent swapping to and from the slot. Verify that the
- * swap entry hasn't been invalidated and recycled behind our
- * backs (our zswap_entry reference doesn't prevent that), to
- * avoid overwriting a new swap folio with old compressed data.
+ * concurrent swapping to and from the slot, and concurrent
+ * swapoff so we can safely dereference the zswap tree here.
+ * Verify that the swap entry hasn't been invalidated and recycled
+ * behind our backs, to avoid overwriting a new swap folio with
+ * old compressed data. Only when this is successful can the entry
+ * be dereferenced.
*/
+ tree = swap_zswap_tree(swpentry);
spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
+ if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
folio_unlock(folio);
folio_put(folio);
return -ENOMEM;
}
+
+ /* Safe to deref entry after the entry is verified above. */
+ zswap_rb_erase(&tree->rbroot, entry);
spin_unlock(&tree->lock);
- __zswap_load(entry, &folio->page);
+ zswap_decompress(entry, &folio->page);
+
+ count_vm_event(ZSWPWB);
+ if (entry->objcg)
+ count_objcg_event(entry->objcg, ZSWPWB);
+
+ zswap_entry_free(entry);
/* folio is up to date */
folio_mark_uptodate(folio);
@@ -1460,6 +1208,268 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
return 0;
}
+/*********************************
+* shrinker functions
+**********************************/
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+ spinlock_t *lock, void *arg)
+{
+ struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
+ bool *encountered_page_in_swapcache = (bool *)arg;
+ swp_entry_t swpentry;
+ enum lru_status ret = LRU_REMOVED_RETRY;
+ int writeback_result;
+
+ /*
+ * As soon as we drop the LRU lock, the entry can be freed by
+ * a concurrent invalidation. This means the following:
+ *
+ * 1. We extract the swp_entry_t to the stack, allowing
+ * zswap_writeback_entry() to pin the swap entry and
+	 *    then validate the zswap entry against that swap entry's
+ * tree using pointer value comparison. Only when that
+ * is successful can the entry be dereferenced.
+ *
+ * 2. Usually, objects are taken off the LRU for reclaim. In
+ * this case this isn't possible, because if reclaim fails
+ * for whatever reason, we have no means of knowing if the
+ * entry is alive to put it back on the LRU.
+ *
+ * So rotate it before dropping the lock. If the entry is
+ * written back or invalidated, the free path will unlink
+ * it. For failures, rotation is the right thing as well.
+ *
+ * Temporary failures, where the same entry should be tried
+ * again immediately, almost never happen for this shrinker.
+ * We don't do any trylocking; -ENOMEM comes closest,
+ * but that's extremely rare and doesn't happen spuriously
+ * either. Don't bother distinguishing this case.
+ */
+ list_move_tail(item, &l->list);
+
+ /*
+ * Once the lru lock is dropped, the entry might get freed. The
+ * swpentry is copied to the stack, and entry isn't deref'd again
+ * until the entry is verified to still be alive in the tree.
+ */
+ swpentry = entry->swpentry;
+
+ /*
+ * It's safe to drop the lock here because we return either
+ * LRU_REMOVED_RETRY or LRU_RETRY.
+ */
+ spin_unlock(lock);
+
+ writeback_result = zswap_writeback_entry(entry, swpentry);
+
+ if (writeback_result) {
+ zswap_reject_reclaim_fail++;
+ ret = LRU_RETRY;
+
+ /*
+ * Encountering a page already in swap cache is a sign that we are shrinking
+ * into the warmer region. We should terminate shrinking (if we're in the dynamic
+ * shrinker context).
+ */
+ if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
+ ret = LRU_STOP;
+ *encountered_page_in_swapcache = true;
+ }
+ } else {
+ zswap_written_back_pages++;
+ }
+
+ spin_lock(lock);
+ return ret;
+}
+
+static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+ unsigned long shrink_ret, nr_protected, lru_size;
+ struct zswap_pool *pool = shrinker->private_data;
+ bool encountered_page_in_swapcache = false;
+
+ if (!zswap_shrinker_enabled ||
+ !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
+ }
+
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ lru_size = list_lru_shrink_count(&pool->list_lru, sc);
+
+ /*
+ * Abort if we are shrinking into the protected region.
+ *
+	 * This short-circuiting is necessary because if we have too many
+	 * concurrent reclaimers getting the freeable zswap object counts at the
+	 * same time (before any of them has made reasonable progress), the total
+	 * number of reclaimed objects might be more than the number of unprotected
+	 * objects (i.e., the reclaimers will reclaim into the protected area of the
+ * zswap LRU).
+ */
+ if (nr_protected >= lru_size - sc->nr_to_scan) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
+ }
+
+ shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
+ &encountered_page_in_swapcache);
+
+ if (encountered_page_in_swapcache)
+ return SHRINK_STOP;
+
+ return shrink_ret ? shrink_ret : SHRINK_STOP;
+}
+
+static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct zswap_pool *pool = shrinker->private_data;
+ struct mem_cgroup *memcg = sc->memcg;
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
+ unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+
+ if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
+ return 0;
+
+#ifdef CONFIG_MEMCG_KMEM
+ mem_cgroup_flush_stats(memcg);
+ nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
+ nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
+#else
+ /* use pool stats instead of memcg stats */
+ nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
+ nr_stored = atomic_read(&pool->nr_stored);
+#endif
+
+ if (!nr_stored)
+ return 0;
+
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
+ /*
+	 * Subtract from the LRU size an estimate of the number of pages
+	 * that should be protected.
+ */
+ nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+
+ /*
+ * Scale the number of freeable pages by the memory saving factor.
+	 * This ensures that the better zswap compresses memory, the fewer
+	 * pages we will evict to swap (since evicting them would otherwise
+	 * incur IO for a relatively small memory saving).
+ */
+ return mult_frac(nr_freeable, nr_backing, nr_stored);
+}
+
+static void zswap_alloc_shrinker(struct zswap_pool *pool)
+{
+ pool->shrinker =
+ shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
+ if (!pool->shrinker)
+ return;
+
+ pool->shrinker->private_data = pool;
+ pool->shrinker->scan_objects = zswap_shrinker_scan;
+ pool->shrinker->count_objects = zswap_shrinker_count;
+ pool->shrinker->batch = 0;
+ pool->shrinker->seeks = DEFAULT_SEEKS;
+}
+
+static int shrink_memcg(struct mem_cgroup *memcg)
+{
+ struct zswap_pool *pool;
+ int nid, shrunk = 0;
+
+ if (!mem_cgroup_zswap_writeback_enabled(memcg))
+ return -EINVAL;
+
+ /*
+ * Skip zombies because their LRUs are reparented and we would be
+ * reclaiming from the parent instead of the dead memcg.
+ */
+ if (memcg && !mem_cgroup_online(memcg))
+ return -ENOENT;
+
+ pool = zswap_pool_current_get();
+ if (!pool)
+ return -EINVAL;
+
+ for_each_node_state(nid, N_NORMAL_MEMORY) {
+ unsigned long nr_to_walk = 1;
+
+ shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
+ &shrink_memcg_cb, NULL, &nr_to_walk);
+ }
+ zswap_pool_put(pool);
+ return shrunk ? 0 : -EAGAIN;
+}
+
+static void shrink_worker(struct work_struct *w)
+{
+ struct zswap_pool *pool = container_of(w, typeof(*pool),
+ shrink_work);
+ struct mem_cgroup *memcg;
+ int ret, failures = 0;
+
+ /* global reclaim will select cgroup in a round-robin fashion. */
+ do {
+ spin_lock(&zswap_pools_lock);
+ pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+ memcg = pool->next_shrink;
+
+ /*
+ * We need to retry if we have gone through a full round trip, or if we
+ * got an offline memcg (or else we risk undoing the effect of the
+ * zswap memcg offlining cleanup callback). This is not catastrophic
+ * per se, but it will keep the now offlined memcg hostage for a while.
+ *
+ * Note that if we got an online memcg, we will keep the extra
+ * reference in case the original reference obtained by mem_cgroup_iter
+ * is dropped by the zswap memcg offlining callback, ensuring that the
+ * memcg is not killed when we are reclaiming.
+ */
+ if (!memcg) {
+ spin_unlock(&zswap_pools_lock);
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+ goto resched;
+ }
+
+ if (!mem_cgroup_tryget_online(memcg)) {
+ /* drop the reference from mem_cgroup_iter() */
+ mem_cgroup_iter_break(NULL, memcg);
+ pool->next_shrink = NULL;
+ spin_unlock(&zswap_pools_lock);
+
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+ goto resched;
+ }
+ spin_unlock(&zswap_pools_lock);
+
+ ret = shrink_memcg(memcg);
+ /* drop the extra reference */
+ mem_cgroup_put(memcg);
+
+ if (ret == -EINVAL)
+ break;
+ if (ret && ++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+resched:
+ cond_resched();
+ } while (!zswap_can_accept());
+ zswap_pool_put(pool);
+}
+
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
unsigned long *page;
@@ -1493,23 +1503,12 @@ static void zswap_fill_page(void *ptr, unsigned long value)
bool zswap_store(struct folio *folio)
{
swp_entry_t swp = folio->swap;
- int type = swp_type(swp);
pgoff_t offset = swp_offset(swp);
- struct page *page = &folio->page;
- struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry, *dupentry;
- struct scatterlist input, output;
- struct crypto_acomp_ctx *acomp_ctx;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
- struct zswap_pool *pool;
- struct zpool *zpool;
- unsigned int dlen = PAGE_SIZE;
- unsigned long handle, value;
- char *buf;
- u8 *src, *dst;
- gfp_t gfp;
- int ret;
+ struct zswap_pool *shrink_pool;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1518,24 +1517,8 @@ bool zswap_store(struct folio *folio)
if (folio_test_large(folio))
return false;
- if (!tree)
- return false;
-
- /*
- * If this is a duplicate, it must be removed before attempting to store
- * it, otherwise, if the store fails the old page won't be removed from
- * the tree, and it might be written back overriding the new data.
- */
- spin_lock(&tree->lock);
- dupentry = zswap_rb_search(&tree->rbroot, offset);
- if (dupentry) {
- zswap_duplicate_entry++;
- zswap_invalidate_entry(tree, dupentry);
- }
- spin_unlock(&tree->lock);
-
if (!zswap_enabled)
- return false;
+ goto check_old;
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
@@ -1562,17 +1545,19 @@ bool zswap_store(struct folio *folio)
}
/* allocate entry */
- entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
+ entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
if (!entry) {
zswap_reject_kmemcache_fail++;
goto reject;
}
if (zswap_same_filled_pages_enabled) {
- src = kmap_local_page(page);
+ unsigned long value;
+ u8 *src;
+
+ src = kmap_local_folio(folio, 0);
if (zswap_is_page_same_filled(src, &value)) {
kunmap_local(src);
- entry->swpentry = swp_entry(type, offset);
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
@@ -1598,67 +1583,11 @@ bool zswap_store(struct folio *folio)
mem_cgroup_put(memcg);
}
- /* compress */
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
- dst = acomp_ctx->buffer;
- sg_init_table(&input, 1);
- sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
-
- /*
- * We need PAGE_SIZE * 2 here since there maybe over-compression case,
- * and hardware-accelerators may won't check the dst buffer size, so
- * giving the dst buffer with enough length to avoid buffer overflow.
- */
- sg_init_one(&output, dst, PAGE_SIZE * 2);
- acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
- /*
- * it maybe looks a little bit silly that we send an asynchronous request,
- * then wait for its completion synchronously. This makes the process look
- * synchronous in fact.
- * Theoretically, acomp supports users send multiple acomp requests in one
- * acomp instance, then get those requests done simultaneously. but in this
- * case, zswap actually does store and load page by page, there is no
- * existing method to send the second page before the first page is done
- * in one thread doing zwap.
- * but in different threads running on different cpu, we have different
- * acomp instance, so multiple threads can do (de)compression in parallel.
- */
- ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
-
- if (ret) {
- zswap_reject_compress_fail++;
- goto put_dstmem;
- }
-
- /* store */
- zpool = zswap_find_zpool(entry);
- gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- if (zpool_malloc_support_movable(zpool))
- gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- ret = zpool_malloc(zpool, dlen, gfp, &handle);
- if (ret == -ENOSPC) {
- zswap_reject_compress_poor++;
- goto put_dstmem;
- }
- if (ret) {
- zswap_reject_alloc_fail++;
- goto put_dstmem;
- }
- buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
- memcpy(buf, dst, dlen);
- zpool_unmap_handle(zpool, handle);
- mutex_unlock(&acomp_ctx->mutex);
-
- /* populate entry */
- entry->swpentry = swp_entry(type, offset);
- entry->handle = handle;
- entry->length = dlen;
+ if (!zswap_compress(folio, entry))
+ goto put_pool;
insert_entry:
+ entry->swpentry = swp;
entry->objcg = objcg;
if (objcg) {
obj_cgroup_charge_zswap(objcg, entry->length);
@@ -1669,15 +1598,12 @@ insert_entry:
/* map */
spin_lock(&tree->lock);
/*
- * A duplicate entry should have been removed at the beginning of this
- * function. Since the swap entry should be pinned, if a duplicate is
- * found again here it means that something went wrong in the swap
- * cache.
+ * The folio may have been dirtied again, invalidate the
+ * possibly stale entry before inserting the new entry.
*/
- while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
- WARN_ON(1);
- zswap_duplicate_entry++;
+ if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
zswap_invalidate_entry(tree, dupentry);
+ WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry));
}
if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
@@ -1693,8 +1619,6 @@ insert_entry:
return true;
-put_dstmem:
- mutex_unlock(&acomp_ctx->mutex);
put_pool:
zswap_pool_put(entry->pool);
freepage:
@@ -1702,38 +1626,48 @@ freepage:
reject:
if (objcg)
obj_cgroup_put(objcg);
+check_old:
+ /*
+ * If the zswap store fails or zswap is disabled, we must invalidate the
+ * possibly stale entry which was previously stored at this offset.
+ * Otherwise, writeback could overwrite the new data in the swapfile.
+ */
+ spin_lock(&tree->lock);
+ entry = zswap_rb_search(&tree->rbroot, offset);
+ if (entry)
+ zswap_invalidate_entry(tree, entry);
+ spin_unlock(&tree->lock);
return false;
shrink:
- pool = zswap_pool_last_get();
- if (pool && !queue_work(shrink_wq, &pool->shrink_work))
- zswap_pool_put(pool);
+ shrink_pool = zswap_pool_last_get();
+ if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work))
+ zswap_pool_put(shrink_pool);
goto reject;
}
bool zswap_load(struct folio *folio)
{
swp_entry_t swp = folio->swap;
- int type = swp_type(swp);
pgoff_t offset = swp_offset(swp);
struct page *page = &folio->page;
- struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
u8 *dst;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
- /* find */
spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_rb_search(&tree->rbroot, offset);
if (!entry) {
spin_unlock(&tree->lock);
return false;
}
+ zswap_rb_erase(&tree->rbroot, entry);
spin_unlock(&tree->lock);
if (entry->length)
- __zswap_load(entry, page);
+ zswap_decompress(entry, page);
else {
dst = kmap_local_page(page);
zswap_fill_page(dst, entry->value);
@@ -1744,67 +1678,63 @@ bool zswap_load(struct folio *folio)
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPIN);
- spin_lock(&tree->lock);
- if (zswap_exclusive_loads_enabled) {
- zswap_invalidate_entry(tree, entry);
- folio_mark_dirty(folio);
- } else if (entry->length) {
- zswap_lru_del(&entry->pool->list_lru, entry);
- zswap_lru_add(&entry->pool->list_lru, entry);
- }
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
+ zswap_entry_free(entry);
+
+ folio_mark_dirty(folio);
return true;
}
-void zswap_invalidate(int type, pgoff_t offset)
+void zswap_invalidate(swp_entry_t swp)
{
- struct zswap_tree *tree = zswap_trees[type];
+ pgoff_t offset = swp_offset(swp);
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
- /* find */
spin_lock(&tree->lock);
entry = zswap_rb_search(&tree->rbroot, offset);
- if (!entry) {
- /* entry was written back */
- spin_unlock(&tree->lock);
- return;
- }
- zswap_invalidate_entry(tree, entry);
+ if (entry)
+ zswap_invalidate_entry(tree, entry);
spin_unlock(&tree->lock);
}
-void zswap_swapon(int type)
+int zswap_swapon(int type, unsigned long nr_pages)
{
- struct zswap_tree *tree;
+ struct zswap_tree *trees, *tree;
+ unsigned int nr, i;
- tree = kzalloc(sizeof(*tree), GFP_KERNEL);
- if (!tree) {
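+	/* One tree, with its own lock, per SWAP_ADDRESS_SPACE_PAGES worth of swap slots. */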
+ nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
+ if (!trees) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
- return;
+ return -ENOMEM;
}
- tree->rbroot = RB_ROOT;
- spin_lock_init(&tree->lock);
- zswap_trees[type] = tree;
+ for (i = 0; i < nr; i++) {
+ tree = trees + i;
+ tree->rbroot = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ }
+
+ nr_zswap_trees[type] = nr;
+ zswap_trees[type] = trees;
+ return 0;
}
void zswap_swapoff(int type)
{
- struct zswap_tree *tree = zswap_trees[type];
- struct zswap_entry *entry, *n;
+ struct zswap_tree *trees = zswap_trees[type];
+ unsigned int i;
- if (!tree)
+ if (!trees)
return;
- /* walk the tree and free everything */
- spin_lock(&tree->lock);
- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(entry);
- tree->rbroot = RB_ROOT;
- spin_unlock(&tree->lock);
- kfree(tree);
+ /* try_to_unuse() invalidated all the entries already */
+ for (i = 0; i < nr_zswap_trees[type]; i++)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot));
+
+ kvfree(trees);
+ nr_zswap_trees[type] = 0;
zswap_trees[type] = NULL;
}
@@ -1837,8 +1767,6 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_poor);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
- debugfs_create_u64("duplicate_entry", 0444,
- zswap_debugfs_root, &zswap_duplicate_entry);
debugfs_create_u64("pool_total_size", 0444,
zswap_debugfs_root, &zswap_pool_total_size);
debugfs_create_atomic_t("stored_pages", 0444,
@@ -1887,7 +1815,8 @@ static int zswap_setup(void)
zswap_enabled = false;
}
- shrink_wq = create_workqueue("zswap-shrink");
+ shrink_wq = alloc_workqueue("zswap-shrink",
+ WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
if (!shrink_wq)
goto fallback_fail;
diff --git a/scripts/gdb/linux/vmalloc.py b/scripts/gdb/linux/vmalloc.py
index d3c8a0274d1e..803f17371052 100644
--- a/scripts/gdb/linux/vmalloc.py
+++ b/scripts/gdb/linux/vmalloc.py
@@ -29,32 +29,34 @@ class LxVmallocInfo(gdb.Command):
if not constants.LX_CONFIG_MMU:
raise gdb.GdbError("Requires MMU support")
- vmap_area_list = gdb.parse_and_eval('vmap_area_list')
- for vmap_area in lists.list_for_each_entry(vmap_area_list, vmap_area_ptr_type, "list"):
- if not vmap_area['vm']:
- gdb.write("0x%x-0x%x %10d vm_map_ram\n" % (vmap_area['va_start'], vmap_area['va_end'],
- vmap_area['va_end'] - vmap_area['va_start']))
- continue
- v = vmap_area['vm']
- gdb.write("0x%x-0x%x %10d" % (v['addr'], v['addr'] + v['size'], v['size']))
- if v['caller']:
- gdb.write(" %s" % str(v['caller']).split(' ')[-1])
- if v['nr_pages']:
- gdb.write(" pages=%d" % v['nr_pages'])
- if v['phys_addr']:
- gdb.write(" phys=0x%x" % v['phys_addr'])
- if v['flags'] & constants.LX_VM_IOREMAP:
- gdb.write(" ioremap")
- if v['flags'] & constants.LX_VM_ALLOC:
- gdb.write(" vmalloc")
- if v['flags'] & constants.LX_VM_MAP:
- gdb.write(" vmap")
- if v['flags'] & constants.LX_VM_USERMAP:
- gdb.write(" user")
- if v['flags'] & constants.LX_VM_DMA_COHERENT:
- gdb.write(" dma-coherent")
- if is_vmalloc_addr(v['pages']):
- gdb.write(" vpages")
- gdb.write("\n")
+ nr_vmap_nodes = gdb.parse_and_eval('nr_vmap_nodes')
+ for i in range(0, nr_vmap_nodes):
+ vn = gdb.parse_and_eval('&vmap_nodes[%d]' % i)
+ for vmap_area in lists.list_for_each_entry(vn['busy']['head'], vmap_area_ptr_type, "list"):
+ if not vmap_area['vm']:
+ gdb.write("0x%x-0x%x %10d vm_map_ram\n" % (vmap_area['va_start'], vmap_area['va_end'],
+ vmap_area['va_end'] - vmap_area['va_start']))
+ continue
+ v = vmap_area['vm']
+ gdb.write("0x%x-0x%x %10d" % (v['addr'], v['addr'] + v['size'], v['size']))
+ if v['caller']:
+ gdb.write(" %s" % str(v['caller']).split(' ')[-1])
+ if v['nr_pages']:
+ gdb.write(" pages=%d" % v['nr_pages'])
+ if v['phys_addr']:
+ gdb.write(" phys=0x%x" % v['phys_addr'])
+ if v['flags'] & constants.LX_VM_IOREMAP:
+ gdb.write(" ioremap")
+ if v['flags'] & constants.LX_VM_ALLOC:
+ gdb.write(" vmalloc")
+ if v['flags'] & constants.LX_VM_MAP:
+ gdb.write(" vmap")
+ if v['flags'] & constants.LX_VM_USERMAP:
+ gdb.write(" user")
+ if v['flags'] & constants.LX_VM_DMA_COHERENT:
+ gdb.write(" dma-coherent")
+ if is_vmalloc_addr(v['pages']):
+ gdb.write(" vpages")
+ gdb.write("\n")
LxVmallocInfo()
diff --git a/tools/mm/Makefile b/tools/mm/Makefile
index 1c5606cc3334..7bb03606b9ea 100644
--- a/tools/mm/Makefile
+++ b/tools/mm/Makefile
@@ -3,7 +3,8 @@
#
include ../scripts/Makefile.include
-TARGETS=page-types slabinfo page_owner_sort
+BUILD_TARGETS=page-types slabinfo page_owner_sort
+INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps
LIB_DIR = ../lib/api
LIBS = $(LIB_DIR)/libapi.a
@@ -11,9 +12,9 @@ LIBS = $(LIB_DIR)/libapi.a
CFLAGS += -Wall -Wextra -I../lib/ -pthread
LDFLAGS += $(LIBS) -pthread
-all: $(TARGETS)
+all: $(BUILD_TARGETS)
-$(TARGETS): $(LIBS)
+$(BUILD_TARGETS): $(LIBS)
$(LIBS):
make -C $(LIB_DIR)
@@ -29,4 +30,4 @@ sbindir ?= /usr/sbin
install: all
install -d $(DESTDIR)$(sbindir)
- install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
+ install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps
new file mode 100644
index 000000000000..803e0318f2fe
--- /dev/null
+++ b/tools/mm/thpmaps
@@ -0,0 +1,675 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 ARM Ltd.
+#
+# Utility providing smaps-like output detailing transparent hugepage usage.
+# For more info, run:
+# ./thpmaps --help
+#
+# Requires numpy:
+# pip3 install numpy
+
+
+import argparse
+import collections
+import math
+import os
+import re
+import resource
+import shutil
+import sys
+import textwrap
+import time
+import numpy as np
+
+
+with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f:
+ PAGE_SIZE = resource.getpagesize()
+ PAGE_SHIFT = int(math.log2(PAGE_SIZE))
+ PMD_SIZE = int(f.read())
+ PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE))
+
+
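+# Round v up to the nearest multiple of a (a must be a power of two).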
+def align_forward(v, a):
+ return (v + (a - 1)) & ~(a - 1)
+
+
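+# Offset of v from the previous a-aligned boundary (a must be a power of two).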
+def align_offset(v, a):
+ return v & (a - 1)
+
+
+def kbnr(kb):
+ # Convert KB to number of pages.
+ return (kb << 10) >> PAGE_SHIFT
+
+
+def nrkb(nr):
+ # Convert number of pages to KB.
+ return (nr << PAGE_SHIFT) >> 10
+
+
+def odkb(order):
+ # Convert page order to KB.
+ return (PAGE_SIZE << order) >> 10
+
+
+def cont_ranges_all(search, index):
+    # Given a list of arrays, find the ranges over which the values increase
+    # contiguously (by exactly 1) in every search array. All arrays in search
+    # and index must be the same size.
+ sz = len(search[0])
+ r = np.full(sz, 2)
+ d = np.diff(search[0]) == 1
+ for dd in [np.diff(arr) == 1 for arr in search[1:]]:
+ d &= dd
+ r[1:] -= d
+ r[:-1] -= d
+ return [np.repeat(arr, r).reshape(-1, 2) for arr in index]
+
+
+class ArgException(Exception):
+ pass
+
+
+class FileIOException(Exception):
+ pass
+
+
+class BinArrayFile:
+ # Base class used to read /proc/<pid>/pagemap and /proc/kpageflags into a
+    # numpy array. Use an inherited class in a with clause to ensure the file
+    # is closed when it goes out of scope.
+ def __init__(self, filename, element_size):
+ self.element_size = element_size
+ self.filename = filename
+ self.fd = os.open(self.filename, os.O_RDONLY)
+
+ def cleanup(self):
+ os.close(self.fd)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.cleanup()
+
+ def _readin(self, offset, buffer):
+ length = os.preadv(self.fd, (buffer,), offset)
+ if len(buffer) != length:
+ raise FileIOException('error: {} failed to read {} bytes at {:x}'
+ .format(self.filename, len(buffer), offset))
+
+ def _toarray(self, buf):
+ assert(self.element_size == 8)
+ return np.frombuffer(buf, dtype=np.uint64)
+
+ def getv(self, vec):
+ vec *= self.element_size
+ offsets = vec[:, 0]
+ lengths = (np.diff(vec) + self.element_size).reshape(len(vec))
+ buf = bytearray(int(np.sum(lengths)))
+ view = memoryview(buf)
+ pos = 0
+ for offset, length in zip(offsets, lengths):
+ offset = int(offset)
+ length = int(length)
+ self._readin(offset, view[pos:pos+length])
+ pos += length
+ return self._toarray(buf)
+
+ def get(self, index, nr=1):
+ offset = index * self.element_size
+ length = nr * self.element_size
+ buf = bytearray(length)
+ self._readin(offset, buf)
+ return self._toarray(buf)
+
+
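+# /proc/<pid>/pagemap entry layout: bit 63 is the present bit, bits 0-54 hold the PFN.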
+PM_PAGE_PRESENT = 1 << 63
+PM_PFN_MASK = (1 << 55) - 1
+
+class PageMap(BinArrayFile):
+ # Read ranges of a given pid's pagemap into a numpy array.
+ def __init__(self, pid='self'):
+ super().__init__(f'/proc/{pid}/pagemap', 8)
+
+
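+# Flag bits read from /proc/kpageflags (see include/uapi/linux/kernel-page-flags.h).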
+KPF_ANON = 1 << 12
+KPF_COMPOUND_HEAD = 1 << 15
+KPF_COMPOUND_TAIL = 1 << 16
+KPF_THP = 1 << 22
+
+class KPageFlags(BinArrayFile):
+ # Read ranges of /proc/kpageflags into a numpy array.
+ def __init__(self):
+ super().__init__(f'/proc/kpageflags', 8)
+
+
+vma_all_stats = set([
+ "Size",
+ "Rss",
+ "Pss",
+ "Pss_Dirty",
+ "Shared_Clean",
+ "Shared_Dirty",
+ "Private_Clean",
+ "Private_Dirty",
+ "Referenced",
+ "Anonymous",
+ "KSM",
+ "LazyFree",
+ "AnonHugePages",
+ "ShmemPmdMapped",
+ "FilePmdMapped",
+ "Shared_Hugetlb",
+ "Private_Hugetlb",
+ "Swap",
+ "SwapPss",
+ "Locked",
+])
+
+vma_min_stats = set([
+ "Rss",
+ "Anonymous",
+ "AnonHugePages",
+ "ShmemPmdMapped",
+ "FilePmdMapped",
+])
+
+VMA = collections.namedtuple('VMA', [
+ 'name',
+ 'start',
+ 'end',
+ 'read',
+ 'write',
+ 'execute',
+ 'private',
+ 'pgoff',
+ 'major',
+ 'minor',
+ 'inode',
+ 'stats',
+])
+
+class VMAList:
+ # A container for VMAs, parsed from /proc/<pid>/smaps. Iterate over the
+ # instance to receive VMAs.
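+ # A VMA header line looks like (illustrative example):
+ #   7f1234560000-7f1234580000 rw-p 00000000 00:00 0          [heap]
+ # and is followed by the per-VMA stat lines (Rss, Pss, ...).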
+ def __init__(self, pid='self', stats=[]):
+ self.vmas = []
+ with open(f'/proc/{pid}/smaps', 'r') as file:
+ for line in file:
+ elements = line.split()
+ if '-' in elements[0]:
+ start, end = map(lambda x: int(x, 16), elements[0].split('-'))
+ major, minor = map(lambda x: int(x, 16), elements[3].split(':'))
+ self.vmas.append(VMA(
+ name=elements[5] if len(elements) == 6 else '',
+ start=start,
+ end=end,
+ read=elements[1][0] == 'r',
+ write=elements[1][1] == 'w',
+ execute=elements[1][2] == 'x',
+ private=elements[1][3] == 'p',
+ pgoff=int(elements[2], 16),
+ major=major,
+ minor=minor,
+ inode=int(elements[4], 16),
+ stats={},
+ ))
+ else:
+ param = elements[0][:-1]
+ if param in stats:
+ value = int(elements[1])
+ self.vmas[-1].stats[param] = {'type': None, 'value': value}
+
+ def __iter__(self):
+ yield from self.vmas
+
+
+def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads):
+ # Given 4 same-sized arrays representing a range within a page table backed
+ # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
+ # True if page is anonymous, heads: True if page is head of a THP), return a
+ # dictionary of statistics describing the mapped THPs.
+ stats = {
+ 'file': {
+ 'partial': 0,
+ 'aligned': [0] * (PMD_ORDER + 1),
+ 'unaligned': [0] * (PMD_ORDER + 1),
+ },
+ 'anon': {
+ 'partial': 0,
+ 'aligned': [0] * (PMD_ORDER + 1),
+ 'unaligned': [0] * (PMD_ORDER + 1),
+ },
+ }
+
+ for rindex, rpfn in zip(ranges[0], ranges[2]):
+ index_next = int(rindex[0])
+ index_end = int(rindex[1]) + 1
+ pfn_end = int(rpfn[1]) + 1
+
+ folios = indexes[index_next:index_end][heads[index_next:index_end]]
+
+ # Account pages for any partially mapped THP at the front. In that case,
+ # the first page of the range is a tail.
+ nr = (int(folios[0]) if len(folios) else index_end) - index_next
+ stats['anon' if anons[index_next] else 'file']['partial'] += nr
+
+ # Account pages for any partially mapped THP at the back. In that case,
+ # the next page after the range is a tail.
+ if len(folios):
+ flags = int(kpageflags.get(pfn_end)[0])
+ if flags & KPF_COMPOUND_TAIL:
+ nr = index_end - int(folios[-1])
+ folios = folios[:-1]
+ index_end -= nr
+ stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr
+
+ # Account fully mapped THPs in the middle of the range.
+ if len(folios):
+ folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
+ folio_orders = np.log2(folio_nrs).astype(np.uint64)
+ for index, order in zip(folios, folio_orders):
+ index = int(index)
+ order = int(order)
+ nr = 1 << order
+ vfn = int(vfns[index])
+ align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned'
+ anon = 'anon' if anons[index] else 'file'
+ stats[anon][align][order] += nr
+
+ # Account PMD-mapped THPs separately, so filter them out of the stats. There
+ # is a race between acquiring the smaps stats and reading pagemap, where
+ # memory could be deallocated, so clamp to zero in case the value would have
+ # gone negative.
+ anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+ file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+ vma.stats['FilePmdMapped']['value']
+ stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
+ stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))
+
+ rstats = {
+ f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+ f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped},
+ }
+
+ def flatten_sub(type, subtype, stats):
+ param = f"{type}-thp-pte-{subtype}-{{}}kB"
+ for od, nr in enumerate(stats[2:], 2):
+ rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)}
+
+ def flatten_type(type, stats):
+ flatten_sub(type, 'aligned', stats['aligned'])
+ flatten_sub(type, 'unaligned', stats['unaligned'])
+ rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])}
+
+ flatten_type('anon', stats['anon'])
+ flatten_type('file', stats['file'])
+
+ return rstats
+
+
+def cont_parse(vma, order, ranges, anons, heads):
+ # Given the ranges and the same-sized anons and heads arrays for a region
+ # within a page table backed by THPs (anons: True if page is anonymous,
+ # heads: True if page is head of a THP), return a dictionary of statistics
+ # describing the naturally aligned, contiguous blocks of the given order.
+ nr_cont = 1 << order
+ nr_anon = 0
+ nr_file = 0
+
+ for rindex, rvfn, rpfn in zip(*ranges):
+ index_next = int(rindex[0])
+ index_end = int(rindex[1]) + 1
+ vfn_start = int(rvfn[0])
+ pfn_start = int(rpfn[0])
+
+ if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont):
+ continue
+
+ off = align_forward(vfn_start, nr_cont) - vfn_start
+ index_next += off
+
+ while index_next + nr_cont <= index_end:
+ folio_boundary = heads[index_next+1:index_next+nr_cont].any()
+ if not folio_boundary:
+ if anons[index_next]:
+ nr_anon += nr_cont
+ else:
+ nr_file += nr_cont
+ index_next += nr_cont
+
+ # Account blocks that are PMD-mapped separately, so filter them out of the
+ # stats. There is a race between acquiring the smaps stats and reading
+ # pagemap, where memory could be deallocated, so clamp to zero in case the
+ # value would have gone negative.
+ anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+ file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+ vma.stats['FilePmdMapped']['value']
+ nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped))
+ nr_file = max(0, nr_file - kbnr(file_pmd_mapped))
+
+ rstats = {
+ f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+ f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped},
+ }
+
+ rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)}
+ rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)}
+
+ return rstats
+
+
+def vma_print(vma, pid):
+ # Prints a VMA instance in a format similar to smaps. The main difference is
+ # that the pid is included as the first value.
+ print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}"
+ .format(
+ pid, vma.start, vma.end,
+ 'r' if vma.read else '-', 'w' if vma.write else '-',
+ 'x' if vma.execute else '-', 'p' if vma.private else 's',
+ vma.pgoff, vma.major, vma.minor, vma.inode, vma.name
+ ))
+
+
+def stats_print(stats, tot_anon, tot_file, inc_empty):
+ # Print a statistics dictionary.
+ label_field = 32
+ for label, stat in stats.items():
+ type = stat['type']
+ value = stat['value']
+ if value or inc_empty:
+ pad = max(0, label_field - len(label) - 1)
+ if type == 'anon' and tot_anon > 0:
+ percent = f' ({value / tot_anon:3.0%})'
+ elif type == 'file' and tot_file > 0:
+ percent = f' ({value / tot_file:3.0%})'
+ else:
+ percent = ''
+ print(f"{label}:{' ' * pad}{value:8} kB{percent}")
+
+
+def vma_parse(vma, pagemap, kpageflags, contorders):
+ # Generate thp and cont statistics for a single VMA.
+ start = vma.start >> PAGE_SHIFT
+ end = vma.end >> PAGE_SHIFT
+
+ pmes = pagemap.get(start, end - start)
+ present = pmes & PM_PAGE_PRESENT != 0
+ pfns = pmes & PM_PFN_MASK
+ pfns = pfns[present]
+ vfns = np.arange(start, end, dtype=np.uint64)
+ vfns = vfns[present]
+
+ pfn_vec = cont_ranges_all([pfns], [pfns])[0]
+ flags = kpageflags.getv(pfn_vec)
+ anons = flags & KPF_ANON != 0
+ heads = flags & KPF_COMPOUND_HEAD != 0
+ thps = flags & KPF_THP != 0
+
+ vfns = vfns[thps]
+ pfns = pfns[thps]
+ anons = anons[thps]
+ heads = heads[thps]
+
+ indexes = np.arange(len(vfns), dtype=np.uint64)
+ ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns])
+
+ thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads)
+ contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders]
+
+ tot_anon = vma.stats['Anonymous']['value']
+ tot_file = vma.stats['Rss']['value'] - tot_anon
+
+ return {
+ **thpstats,
+ **{k: v for s in contstats for k, v in s.items()}
+ }, tot_anon, tot_file
+
+
+def do_main(args):
+ pids = set()
+ rollup = {}
+ rollup_anon = 0
+ rollup_file = 0
+
+ if args.cgroup:
+ strict = False
+ for walk_info in os.walk(args.cgroup):
+ cgroup = walk_info[0]
+ with open(f'{cgroup}/cgroup.procs') as pidfile:
+ for line in pidfile.readlines():
+ pids.add(int(line.strip()))
+ elif args.pid:
+ strict = True
+ pids = pids.union(args.pid)
+ else:
+ strict = False
+ for pid in os.listdir('/proc'):
+ if pid.isdigit():
+ pids.add(int(pid))
+
+ if not args.rollup:
+ print(" PID START END PROT OFFSET DEV INODE OBJECT")
+
+ for pid in pids:
+ try:
+ with PageMap(pid) as pagemap:
+ with KPageFlags() as kpageflags:
+ for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats):
+ if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0:
+ stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont)
+ else:
+ stats = {}
+ vma_anon = 0
+ vma_file = 0
+ if args.inc_smaps:
+ stats = {**vma.stats, **stats}
+ if args.rollup:
+ for k, v in stats.items():
+ if k in rollup:
+ assert(rollup[k]['type'] == v['type'])
+ rollup[k]['value'] += v['value']
+ else:
+ rollup[k] = v
+ rollup_anon += vma_anon
+ rollup_file += vma_file
+ else:
+ vma_print(vma, pid)
+ stats_print(stats, vma_anon, vma_file, args.inc_empty)
+ except (FileNotFoundError, ProcessLookupError, FileIOException):
+ if strict:
+ raise
+
+ if args.rollup:
+ stats_print(rollup, rollup_anon, rollup_file, args.inc_empty)
+
+
+def main():
+ docs_width = shutil.get_terminal_size().columns
+ docs_width -= 2
+ docs_width = min(80, docs_width)
+
+ def format(string):
+ text = re.sub(r'\s+', ' ', string)
+ text = re.sub(r'\s*\\n\s*', '\n', text)
+ paras = text.split('\n')
+ paras = [textwrap.fill(p, width=docs_width) for p in paras]
+ return '\n'.join(paras)
+
+ def formatter(prog):
+ return argparse.RawDescriptionHelpFormatter(prog, width=docs_width)
+
+ def size2order(human):
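+ # Convert a human-readable size ("64K", "2M", ...) to a page order. For
+ # example, with 4K base pages (an assumption for illustration only), "64K"
+ # gives order 4 and "2M" gives order 9.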
+ units = {
+ "K": 2**10, "M": 2**20, "G": 2**30,
+ "k": 2**10, "m": 2**20, "g": 2**30,
+ }
+ unit = 1
+ if human[-1] in units:
+ unit = units[human[-1]]
+ human = human[:-1]
+ try:
+ size = int(human)
+ except ValueError:
+ raise ArgException('error: --cont value must be integer size with optional KMG unit')
+ size *= unit
+ order = int(math.log2(size / PAGE_SIZE))
+ if order < 1:
+ raise ArgException('error: --cont value must be size of at least 2 pages')
+ if (1 << order) * PAGE_SIZE != size:
+ raise ArgException('error: --cont value must be size of power-of-2 pages')
+ if order > PMD_ORDER:
+ raise ArgException('error: --cont value must be less than or equal to PMD order')
+ return order
+
+ parser = argparse.ArgumentParser(formatter_class=formatter,
+ description=format("""Prints information about how transparent huge
+ pages are mapped, either system-wide, or for a specified
+ process or cgroup.\\n
+ \\n
+ When run with --pid, the user explicitly specifies the set
+ of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run
+ with --cgroup, the user passes either a v1 or v2 cgroup and
+ all pids that belong to the cgroup subtree are scanned. When
+ run with neither --pid nor --cgroup, the full set of pids on
+ the system is gathered from /proc and scanned as if the user
+ had provided "--pid 1 --pid 2 ...".\\n
+ \\n
+ A default set of statistics is always generated for THP
+ mappings. However, it is also possible to generate
+ additional statistics for "contiguous block mappings" where
+ the block size is user-defined.\\n
+ \\n
+ Statistics are maintained independently for anonymous and
+ file-backed (pagecache) memory and are shown both in kB and
+ as a percentage of either total anonymous or total
+ file-backed memory as appropriate.\\n
+ \\n
+ THP Statistics\\n
+ --------------\\n
+ \\n
+ Statistics are always generated for fully- and
+ contiguously-mapped THPs whose mapping address is aligned to
+ their size, for each <size> supported by the system.
+ Separate counters describe THPs mapped by PTE vs those
+ mapped by PMD. (Although note a THP can only be mapped by
+ PMD if it is PMD-sized):\\n
+ \\n
+ - anon-thp-pte-aligned-<size>kB\\n
+ - file-thp-pte-aligned-<size>kB\\n
+ - anon-thp-pmd-aligned-<size>kB\\n
+ - file-thp-pmd-aligned-<size>kB\\n
+ \\n
+ Similarly, statistics are always generated for fully- and
+ contiguously-mapped THPs whose mapping address is *not*
+ aligned to their size, for each <size> supported by the
+ system. Due to the unaligned mapping, it is impossible to
+ map by PMD, so there are only PTE counters for this case:\\n
+ \\n
+ - anon-thp-pte-unaligned-<size>kB\\n
+ - file-thp-pte-unaligned-<size>kB\\n
+ \\n
+ Statistics are also always generated for mapped pages that
+ belong to a THP but where the THP is *not* fully- and
+ contiguously-mapped. These "partial" mappings are all
+ counted in the same counter regardless of the size of the
+ THP that is partially mapped:\\n
+ \\n
+ - anon-thp-pte-partial\\n
+ - file-thp-pte-partial\\n
+ \\n
+ Contiguous Block Statistics\\n
+ ---------------------------\\n
+ \\n
+ An optional, additional set of statistics is generated for
+ every contiguous block size specified with `--cont <size>`.
+ These statistics show how much memory is mapped in
+ contiguous blocks of <size> and also aligned to <size>. A
+ given contiguous block must all belong to the same THP, but
+ there is no requirement for it to be the *whole* THP.
+ Separate counters describe contiguous blocks mapped by PTE
+ vs those mapped by PMD:\\n
+ \\n
+ - anon-cont-pte-aligned-<size>kB\\n
+ - file-cont-pte-aligned-<size>kB\\n
+ - anon-cont-pmd-aligned-<size>kB\\n
+ - file-cont-pmd-aligned-<size>kB\\n
+ \\n
+ As an example, if monitoring 64K contiguous blocks (--cont
+ 64K), there are a number of sources that could provide such
+ blocks: a fully- and contiguously-mapped 64K THP that is
+ aligned to a 64K boundary would provide 1 block. A fully-
+ and contiguously-mapped 128K THP that is aligned to at least
+ a 64K boundary would provide 2 blocks. Or a 128K THP that
+ maps its first 100K, but contiguously and starting at a 64K
+ boundary would provide 1 block. A fully- and
+ contiguously-mapped 2M THP would provide 32 blocks. There
+ are many other possible permutations.\\n"""),
+ epilog=format("""Requires root privilege to access pagemap and
+ kpageflags."""))
+
+ group = parser.add_mutually_exclusive_group(required=False)
+ group.add_argument('--pid',
+ metavar='pid', required=False, type=int, default=[], action='append',
+ help="""Process id of the target process. May be issued multiple times to
+ scan multiple processes. --pid and --cgroup are mutually exclusive.
+ If neither are provided, all processes are scanned to provide
+ system-wide information.""")
+
+ group.add_argument('--cgroup',
+ metavar='path', required=False,
+ help="""Path to the target cgroup in sysfs. Iterates over every pid in
+ the cgroup and its children. --pid and --cgroup are mutually
+ exclusive. If neither are provided, all processes are scanned to
+ provide system-wide information.""")
+
+ parser.add_argument('--rollup',
+ required=False, default=False, action='store_true',
+ help="""Sum the per-vma statistics to provide a summary over the whole
+ system, process or cgroup.""")
+
+ parser.add_argument('--cont',
+ metavar='size[KMG]', required=False, default=[], action='append',
+ help="""Adds stats for memory that is mapped in contiguous blocks of
+ <size> and also aligned to <size>. May be issued multiple times to
+ track multiple sized blocks. Useful to infer e.g. arm64 contpte and
+ hpa mappings. Size must be a power-of-2 number of pages.""")
+
+ parser.add_argument('--inc-smaps',
+ required=False, default=False, action='store_true',
+ help="""Include all numerical, additive /proc/<pid>/smaps stats in the
+ output.""")
+
+ parser.add_argument('--inc-empty',
+ required=False, default=False, action='store_true',
+ help="""Show all statistics including those whose value is 0.""")
+
+ parser.add_argument('--periodic',
+ metavar='sleep_ms', required=False, type=int,
+ help="""Run in a loop, polling every sleep_ms milliseconds.""")
+
+ args = parser.parse_args()
+
+ try:
+ args.cont = [size2order(cont) for cont in args.cont]
+ except ArgException as e:
+ parser.print_usage()
+ raise
+
+ if args.periodic:
+ while True:
+ do_main(args)
+ print()
+ time.sleep(args.periodic / 1000)
+ else:
+ do_main(args)
+
+
+if __name__ == "__main__":
+ try:
+ main()
+ except Exception as e:
+ prog = os.path.basename(sys.argv[0])
+ print(f'{prog}: {e}')
+ exit(1)
diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
index 47fdaa146443..f0e488ed90d8 100644
--- a/tools/testing/selftests/cgroup/test_zswap.c
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -52,7 +52,7 @@ static int get_zswap_stored_pages(size_t *value)
static int get_cg_wb_count(const char *cg)
{
- return cg_read_key_long(cg, "memory.stat", "zswp_wb");
+ return cg_read_key_long(cg, "memory.stat", "zswpwb");
}
static long get_zswpout(const char *cgroup)
@@ -60,6 +60,27 @@ static long get_zswpout(const char *cgroup)
return cg_read_key_long(cgroup, "memory.stat", "zswpout ");
}
+static int allocate_and_read_bytes(const char *cgroup, void *arg)
+{
+ size_t size = (size_t)arg;
+ char *mem = (char *)malloc(size);
+ int ret = 0;
+
+ if (!mem)
+ return -1;
+ for (int i = 0; i < size; i += 4095)
+ mem[i] = 'a';
+
+ /* Go through the allocated memory to (z)swap in and out pages */
+ for (int i = 0; i < size; i += 4095) {
+ if (mem[i] != 'a')
+ ret = -1;
+ }
+
+ free(mem);
+ return ret;
+}
+
static int allocate_bytes(const char *cgroup, void *arg)
{
size_t size = (size_t)arg;
@@ -100,7 +121,6 @@ static int test_zswap_usage(const char *root)
int ret = KSFT_FAIL;
char *test_group;
- /* Set up */
test_group = cg_name(root, "no_shrink_test");
if (!test_group)
goto out;
@@ -134,6 +154,101 @@ out:
}
/*
+ * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for
+ * the cgroup.
+ */
+static int test_swapin_nozswap(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *test_group;
+ long swap_peak, zswpout;
+
+ test_group = cg_name(root, "no_zswap_test");
+ if (!test_group)
+ goto out;
+ if (cg_create(test_group))
+ goto out;
+ if (cg_write(test_group, "memory.max", "8M"))
+ goto out;
+ if (cg_write(test_group, "memory.zswap.max", "0"))
+ goto out;
+
+ /* Allocate and read more than memory.max to trigger swapin */
+ if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
+ goto out;
+
+ /* Verify that pages are swapped out, but no zswap happened */
+ swap_peak = cg_read_long(test_group, "memory.swap.peak");
+ if (swap_peak < 0) {
+ ksft_print_msg("failed to get cgroup's swap_peak\n");
+ goto out;
+ }
+
+ if (swap_peak < MB(24)) {
+ ksft_print_msg("at least 24MB of memory should be swapped out\n");
+ goto out;
+ }
+
+ zswpout = get_zswpout(test_group);
+ if (zswpout < 0) {
+ ksft_print_msg("failed to get zswpout\n");
+ goto out;
+ }
+
+ if (zswpout > 0) {
+ ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n");
+ goto out;
+ }
+
+ ret = KSFT_PASS;
+
+out:
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+/* Simple test to verify the (z)swapin code paths */
+static int test_zswapin(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *test_group;
+ long zswpin;
+
+ test_group = cg_name(root, "zswapin_test");
+ if (!test_group)
+ goto out;
+ if (cg_create(test_group))
+ goto out;
+ if (cg_write(test_group, "memory.max", "8M"))
+ goto out;
+ if (cg_write(test_group, "memory.zswap.max", "max"))
+ goto out;
+
+ /* Allocate and read more than memory.max to trigger (z)swap in */
+ if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
+ goto out;
+
+ zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin ");
+ if (zswpin < 0) {
+ ksft_print_msg("failed to get zswpin\n");
+ goto out;
+ }
+
+ if (zswpin < MB(24) / PAGE_SIZE) {
+ ksft_print_msg("at least 24MB should be brought back from zswap\n");
+ goto out;
+ }
+
+ ret = KSFT_PASS;
+
+out:
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+/*
* When trying to store a memcg page in zswap, if the memcg hits its memory
* limit in zswap, writeback should affect only the zswapped pages of that
* memcg.
@@ -144,7 +259,6 @@ static int test_no_invasive_cgroup_shrink(const char *root)
size_t control_allocation_size = MB(10);
char *control_allocation, *wb_group = NULL, *control_group = NULL;
- /* Set up */
wb_group = setup_test_group_1M(root, "per_memcg_wb_test1");
if (!wb_group)
return KSFT_FAIL;
@@ -309,6 +423,8 @@ struct zswap_test {
const char *name;
} tests[] = {
T(test_zswap_usage),
+ T(test_swapin_nozswap),
+ T(test_zswapin),
T(test_no_kmem_bypass),
T(test_no_invasive_cgroup_shrink),
};
diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore
index c6c2965a6607..d861701f0327 100644
--- a/tools/testing/selftests/damon/.gitignore
+++ b/tools/testing/selftests/damon/.gitignore
@@ -1,2 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
huge_count_read_write
+debugfs_target_ids_read_before_terminate_race
+debugfs_target_ids_pid_leak
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index 8a1cc2bf1864..789d6949c247 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -2,6 +2,8 @@
# Makefile for damon selftests
TEST_GEN_FILES += huge_count_read_write
+TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race
+TEST_GEN_FILES += debugfs_target_ids_pid_leak
TEST_GEN_FILES += access_memory
TEST_FILES = _chk_dependency.sh _debugfs_common.sh
@@ -9,9 +11,12 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh
TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh
TEST_PROGS += debugfs_duplicate_context_creation.sh
TEST_PROGS += debugfs_rm_non_contexts.sh
+TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh
+TEST_PROGS += debugfs_target_ids_pid_leak.sh
TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh
TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py
TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py
+TEST_PROGS += damos_quota.py damos_apply_interval.py
TEST_PROGS += reclaim.sh lru_sort.sh
include ../lib.mk
diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh
index 0328ac0b5a5e..dda3a87dc00a 100644
--- a/tools/testing/selftests/damon/_chk_dependency.sh
+++ b/tools/testing/selftests/damon/_chk_dependency.sh
@@ -4,7 +4,14 @@
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
-DBGFS=/sys/kernel/debug/damon
+DBGFS=$(grep debugfs /proc/mounts --max-count 1 | awk '{print $2}')
+if [ "$DBGFS" = "" ]
+then
+ echo "debugfs not mounted"
+ exit $ksft_skip
+fi
+
+DBGFS+="/damon"
if [ $EUID -ne 0 ];
then
@@ -18,7 +25,14 @@ then
exit $ksft_skip
fi
-for f in attrs target_ids monitor_on
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+ monitor_on_file="monitor_on_DEPRECATED"
+else
+ monitor_on_file="monitor_on"
+fi
+
+for f in attrs target_ids "$monitor_on_file"
do
if [ ! -f "$DBGFS/$f" ]
then
@@ -28,7 +42,7 @@ do
done
permission_error="Operation not permitted"
-for f in attrs target_ids monitor_on
+for f in attrs target_ids "$monitor_on_file"
do
status=$( cat "$DBGFS/$f" 2>&1 )
if [ "${status#*$permission_error}" != "$status" ]; then
diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py
index e98cf4b6a4b7..d23d7398a27a 100644
--- a/tools/testing/selftests/damon/_damon_sysfs.py
+++ b/tools/testing/selftests/damon/_damon_sysfs.py
@@ -70,18 +70,65 @@ class DamosAccessPattern:
if err != None:
return err
+class DamosQuota:
+ sz = None # size quota, in bytes
+ ms = None # time quota
+ reset_interval_ms = None # quota reset interval
+ scheme = None # owner scheme
+
+ def __init__(self, sz=0, ms=0, reset_interval_ms=0):
+ self.sz = sz
+ self.ms = ms
+ self.reset_interval_ms = reset_interval_ms
+
+ def sysfs_dir(self):
+ return os.path.join(self.scheme.sysfs_dir(), 'quotas')
+
+ def stage(self):
+ err = write_file(os.path.join(self.sysfs_dir(), 'bytes'), self.sz)
+ if err != None:
+ return err
+ err = write_file(os.path.join(self.sysfs_dir(), 'ms'), self.ms)
+ if err != None:
+ return err
+ err = write_file(os.path.join(self.sysfs_dir(), 'reset_interval_ms'),
+ self.reset_interval_ms)
+ if err != None:
+ return err
+
+class DamosStats:
+ nr_tried = None
+ sz_tried = None
+ nr_applied = None
+ sz_applied = None
+ qt_exceeds = None
+
+ def __init__(self, nr_tried, sz_tried, nr_applied, sz_applied, qt_exceeds):
+ self.nr_tried = nr_tried
+ self.sz_tried = sz_tried
+ self.nr_applied = nr_applied
+ self.sz_applied = sz_applied
+ self.qt_exceeds = qt_exceeds
+
class Damos:
action = None
access_pattern = None
- # todo: Support quotas, watermarks, stats, tried_regions
+ quota = None
+ apply_interval_us = None
+ # todo: Support watermarks, stats, tried_regions
idx = None
context = None
tried_bytes = None
+ stats = None
- def __init__(self, action='stat', access_pattern=DamosAccessPattern()):
+ def __init__(self, action='stat', access_pattern=DamosAccessPattern(),
+ quota=DamosQuota(), apply_interval_us=0):
self.action = action
self.access_pattern = access_pattern
self.access_pattern.scheme = self
+ self.quota = quota
+ self.quota.scheme = self
+ self.apply_interval_us = apply_interval_us
def sysfs_dir(self):
return os.path.join(
@@ -94,13 +141,12 @@ class Damos:
err = self.access_pattern.stage()
if err != None:
return err
-
- # disable quotas
- err = write_file(os.path.join(self.sysfs_dir(), 'quotas', 'ms'), '0')
+ err = write_file(os.path.join(self.sysfs_dir(), 'apply_interval_us'),
+ '%d' % self.apply_interval_us)
if err != None:
return err
- err = write_file(
- os.path.join(self.sysfs_dir(), 'quotas', 'bytes'), '0')
+
+ err = self.quota.stage()
if err != None:
return err
@@ -298,6 +344,23 @@ class Kdamond:
return err
scheme.tried_bytes = int(content)
+ def update_schemes_stats(self):
+ err = write_file(os.path.join(self.sysfs_dir(), 'state'),
+ 'update_schemes_stats')
+ if err != None:
+ return err
+ for context in self.contexts:
+ for scheme in context.schemes:
+ stat_values = []
+ for stat in ['nr_tried', 'sz_tried', 'nr_applied',
+ 'sz_applied', 'qt_exceeds']:
+ content, err = read_file(
+ os.path.join(scheme.sysfs_dir(), 'stats', stat))
+ if err != None:
+ return err
+ stat_values.append(int(content))
+ scheme.stats = DamosStats(*stat_values)
+
class Kdamonds:
kdamonds = []
diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh
index 48989d4813ae..aa995516870b 100644
--- a/tools/testing/selftests/damon/_debugfs_common.sh
+++ b/tools/testing/selftests/damon/_debugfs_common.sh
@@ -45,6 +45,13 @@ test_content() {
source ./_chk_dependency.sh
damon_onoff="$DBGFS/monitor_on"
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+ damon_onoff="$DBGFS/monitor_on_DEPRECATED"
+else
+ damon_onoff="$DBGFS/monitor_on"
+fi
+
if [ $(cat "$damon_onoff") = "on" ]
then
echo "monitoring is on"
diff --git a/tools/testing/selftests/damon/damos_apply_interval.py b/tools/testing/selftests/damon/damos_apply_interval.py
new file mode 100644
index 000000000000..f04d43702481
--- /dev/null
+++ b/tools/testing/selftests/damon/damos_apply_interval.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+import time
+
+import _damon_sysfs
+
+def main():
+ # access two 10 MiB memory regions, 2 seconds for each
+ sz_region = 10 * 1024 * 1024
+ proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000'])
+
+ # Set up two schemes: one using the default apply interval (the aggregation
+ # interval) and one using an explicitly set, ten times shorter interval
+ kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
+ contexts=[_damon_sysfs.DamonCtx(
+ ops='vaddr',
+ targets=[_damon_sysfs.DamonTarget(pid=proc.pid)],
+ schemes=[
+ _damon_sysfs.Damos(
+ access_pattern=_damon_sysfs.DamosAccessPattern(
+ # >= 25% access rate, >= 200ms age
+ nr_accesses=[5, 20], age=[2, 2**64 - 1]),
+ # aggregation interval (100 ms) is used
+ apply_interval_us=0),
+ # use 10ms apply interval
+ _damon_sysfs.Damos(
+ access_pattern=_damon_sysfs.DamosAccessPattern(
+ # >= 25% access rate, >= 200ms age
+ nr_accesses=[5, 20], age=[2, 2**64 - 1]),
+ # explicitly set 10 ms apply interval
+ apply_interval_us=10 * 1000)
+ ] # schemes
+ )] # contexts
+ )]) # kdamonds
+
+ err = kdamonds.start()
+ if err != None:
+ print('kdamond start failed: %s' % err)
+ exit(1)
+
+ wss_collected = []
+ nr_quota_exceeds = 0
+ while proc.poll() == None:
+ time.sleep(0.1)
+ err = kdamonds.kdamonds[0].update_schemes_stats()
+ if err != None:
+ print('stats update failed: %s' % err)
+ exit(1)
+ schemes = kdamonds.kdamonds[0].contexts[0].schemes
+ nr_tried_stats = [s.stats.nr_tried for s in schemes]
+ if nr_tried_stats[0] == 0 or nr_tried_stats[1] == 0:
+ print('scheme(s) are not tried')
+ exit(1)
+
+ # Because the second scheme has an apply interval ten times shorter than
+ # that of the first scheme, it should be tried about ten times more
+ # frequently than the first scheme. To allow for possible timing errors,
+ # only check that it was tried at least nine times more frequently.
+ ratio = nr_tried_stats[1] / nr_tried_stats[0]
+ if ratio < 9:
+ print('%d / %d = %f (< 9)' %
+ (nr_tried_stats[1], nr_tried_stats[0], ratio))
+ exit(1)
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/testing/selftests/damon/damos_quota.py b/tools/testing/selftests/damon/damos_quota.py
new file mode 100644
index 000000000000..7d4c6bb2e3cd
--- /dev/null
+++ b/tools/testing/selftests/damon/damos_quota.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+import time
+
+import _damon_sysfs
+
+def main():
+ # access two 10 MiB memory regions, 2 seconds for each
+ sz_region = 10 * 1024 * 1024
+ proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000'])
+
+ # Set quota up to 1 MiB per 100 ms
+ sz_quota = 1024 * 1024 # 1 MiB
+ quota_reset_interval = 100 # 100 ms
+ kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
+ contexts=[_damon_sysfs.DamonCtx(
+ ops='vaddr',
+ targets=[_damon_sysfs.DamonTarget(pid=proc.pid)],
+ schemes=[_damon_sysfs.Damos(
+ access_pattern=_damon_sysfs.DamosAccessPattern(
+ # >= 25% access rate, >= 200ms age
+ nr_accesses=[5, 20], age=[2, 2**64 - 1]),
+ quota=_damon_sysfs.DamosQuota(
+ sz=sz_quota, reset_interval_ms=quota_reset_interval)
+ )] # schemes
+ )] # contexts
+ )]) # kdamonds
+
+ err = kdamonds.start()
+ if err != None:
+ print('kdamond start failed: %s' % err)
+ exit(1)
+
+ wss_collected = []
+ nr_quota_exceeds = 0
+ while proc.poll() == None:
+ time.sleep(0.1)
+ err = kdamonds.kdamonds[0].update_schemes_tried_bytes()
+ if err != None:
+ print('tried bytes update failed: %s' % err)
+ exit(1)
+ err = kdamonds.kdamonds[0].update_schemes_stats()
+ if err != None:
+ print('stats update failed: %s' % err)
+ exit(1)
+
+ scheme = kdamonds.kdamonds[0].contexts[0].schemes[0]
+ wss_collected.append(scheme.tried_bytes)
+ nr_quota_exceeds = scheme.stats.qt_exceeds
+
+ wss_collected.sort()
+ for wss in wss_collected:
+ if wss > sz_quota:
+ print('quota is not kept: %s > %s' % (wss, sz_quota))
+ print('collected samples are as below')
+ print('\n'.join(['%d' % wss for wss in wss_collected]))
+ exit(1)
+
+ if nr_quota_exceeds < len(wss_collected):
+ print('quota is not always exceeded: %d > %d' %
+ (len(wss_collected), nr_quota_exceeds))
+ exit(1)
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh
index 87aff8083822..effbea33dc16 100755
--- a/tools/testing/selftests/damon/debugfs_empty_targets.sh
+++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh
@@ -8,6 +8,14 @@ source _debugfs_common.sh
orig_target_ids=$(cat "$DBGFS/target_ids")
echo "" > "$DBGFS/target_ids"
-orig_monitor_on=$(cat "$DBGFS/monitor_on")
-test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids"
+
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+ monitor_on_file="$DBGFS/monitor_on_DEPRECATED"
+else
+ monitor_on_file="$DBGFS/monitor_on"
+fi
+
+orig_monitor_on=$(cat "$monitor_on_file")
+test_write_fail "$monitor_on_file" "on" "orig_monitor_on" "empty target ids"
echo "$orig_target_ids" > "$DBGFS/target_ids"
diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c
new file mode 100644
index 000000000000..0cc2eef7d142
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids"
+
+static void write_targetid_exit(void)
+{
+ int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR);
+ char pid_str[128];
+
+ snprintf(pid_str, sizeof(pid_str), "%d", getpid());
+ write(target_ids_fd, pid_str, sizeof(pid_str));
+ close(target_ids_fd);
+ exit(0);
+}
+
+unsigned long msec_timestamp(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return tv.tv_sec * 1000UL + tv.tv_usec / 1000;
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned long start_ms;
+ int time_to_run, nr_forks = 0;
+
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <msecs to run>\n", argv[0]);
+ exit(1);
+ }
+ time_to_run = atoi(argv[1]);
+
+ start_ms = msec_timestamp();
+ while (true) {
+ int pid = fork();
+
+ if (pid < 0) {
+ fprintf(stderr, "fork() failed\n");
+ exit(1);
+ }
+ if (pid == 0)
+ write_targetid_exit();
+ wait(NULL);
+ nr_forks++;
+
+ if (msec_timestamp() - start_ms > time_to_run)
+ break;
+ }
+ printf("%d\n", nr_forks);
+ return 0;
+}
diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh
new file mode 100644
index 000000000000..31fe33c2b032
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+before=$(grep "^pid " /proc/slabinfo | awk '{print $2}')
+
+nr_leaks=$(./debugfs_target_ids_pid_leak 1000)
+expected_after_max=$((before + nr_leaks / 2))
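+
+# If DAMON kept a reference to every exited child's pid, the pid slab would
+# grow by roughly nr_leaks objects; tolerate up to half of that before
+# reporting a leak.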
+
+after=$(grep "^pid " /proc/slabinfo | awk '{print $2}')
+
+echo > /sys/kernel/debug/damon/target_ids
+
+echo "tried $nr_leaks pid leak"
+echo "number of active pid slabs: $before -> $after"
+echo "(up to $expected_after_max expected)"
+if [ $after -gt $expected_after_max ]
+then
+ echo "maybe pids are leaking"
+ exit 1
+else
+ exit 0
+fi
diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c
new file mode 100644
index 000000000000..b06f52a8ce2d
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#define DBGFS_MONITOR_ON "/sys/kernel/debug/damon/monitor_on_DEPRECATED"
+#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids"
+
+static void turn_damon_on_exit(void)
+{
+ int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR);
+ int monitor_on_fd = open(DBGFS_MONITOR_ON, O_RDWR);
+ char pid_str[128];
+
+ snprintf(pid_str, sizeof(pid_str), "%d", getpid());
+ write(target_ids_fd, pid_str, sizeof(pid_str));
+ write(monitor_on_fd, "on\n", 3);
+ close(target_ids_fd);
+ close(monitor_on_fd);
+ usleep(1000);
+ exit(0);
+}
+
+static void try_race(void)
+{
+ int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR);
+ int pid = fork();
+ int buf[256];
+
+ if (pid < 0) {
+ fprintf(stderr, "fork() failed\n");
+ exit(1);
+ }
+ if (pid == 0)
+ turn_damon_on_exit();
+ while (true) {
+ int status;
+
+ read(target_ids_fd, buf, sizeof(buf));
+ if (waitpid(-1, &status, WNOHANG) == pid)
+ break;
+ }
+ close(target_ids_fd);
+}
+
+static inline uint64_t ts_to_ms(struct timespec *ts)
+{
+ return (uint64_t)ts->tv_sec * 1000 + (uint64_t)ts->tv_nsec / 1000000;
+}
+
+int main(int argc, char *argv[])
+{
+ struct timespec start_time, now;
+ int runtime_ms;
+
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <runtime in ms>\n", argv[0]);
+ exit(1);
+ }
+ runtime_ms = atoi(argv[1]);
+ clock_gettime(CLOCK_MONOTONIC, &start_time);
+ while (true) {
+ try_race();
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ if (ts_to_ms(&now) - ts_to_ms(&start_time) > runtime_ms)
+ break;
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh
new file mode 100644
index 000000000000..fc793c4c9aea
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+dmesg -C
+
+./debugfs_target_ids_read_before_terminate_race 5000
+
+if dmesg | grep -q dbgfs_target_ids_read
+then
+ dmesg
+ exit 1
+else
+ exit 0
+fi
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 3df008677239..18f585684e20 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -44,8 +44,6 @@
*/
static size_t mfd_def_size = MFD_DEF_SIZE;
static const char *memfd_str = MEMFD_STR;
-static int newpid_thread_fn2(void *arg);
-static void join_newpid_thread(pid_t pid);
static ssize_t fd2name(int fd, char *buf, size_t bufsize)
{
@@ -194,7 +192,6 @@ static unsigned int mfd_assert_get_seals(int fd)
static void mfd_assert_has_seals(int fd, unsigned int seals)
{
char buf[PATH_MAX];
- int nbytes;
unsigned int s;
fd2name(fd, buf, PATH_MAX);
@@ -696,7 +693,6 @@ static void mfd_assert_mode(int fd, int mode)
{
struct stat st;
char buf[PATH_MAX];
- int nbytes;
fd2name(fd, buf, PATH_MAX);
@@ -715,7 +711,6 @@ static void mfd_assert_mode(int fd, int mode)
static void mfd_assert_chmod(int fd, int mode)
{
char buf[PATH_MAX];
- int nbytes;
fd2name(fd, buf, PATH_MAX);
@@ -731,7 +726,6 @@ static void mfd_fail_chmod(int fd, int mode)
{
struct stat st;
char buf[PATH_MAX];
- int nbytes;
fd2name(fd, buf, PATH_MAX);
@@ -1254,9 +1248,6 @@ static void test_sysctl_set_sysctl2(void)
static int sysctl_simple_child(void *arg)
{
- int fd;
- int pid;
-
printf("%s sysctl 0\n", memfd_str);
test_sysctl_set_sysctl0();
@@ -1321,7 +1312,6 @@ static void test_sysctl_sysctl2_failset(void)
static int sysctl_nested_child(void *arg)
{
- int fd;
int pid;
printf("%s nested sysctl 0\n", memfd_str);
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 4ff10ea61461..d26e962f2ac4 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -46,3 +46,4 @@ gup_longterm
mkdirty
va_high_addr_switch
hugetlb_fault_after_madv
+hugetlb_madv_vs_map
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 2453add65d12..990e9bb112c5 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -70,6 +70,7 @@ TEST_GEN_FILES += ksm_tests
TEST_GEN_FILES += ksm_functional_tests
TEST_GEN_FILES += mdwe_test
TEST_GEN_FILES += hugetlb_fault_after_madv
+TEST_GEN_FILES += hugetlb_madv_vs_map
ifneq ($(ARCH),arm64)
TEST_GEN_FILES += soft-dirty
diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
index 656afba02dbc..533999b6c284 100644
--- a/tools/testing/selftests/mm/compaction_test.c
+++ b/tools/testing/selftests/mm/compaction_test.c
@@ -95,21 +95,22 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
if (fd < 0) {
- ksft_test_result_fail("Failed to open /proc/sys/vm/nr_hugepages: %s\n",
- strerror(errno));
- return -1;
+ ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
+ ret = -1;
+ goto out;
}
if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
- ksft_test_result_fail("Failed to read from /proc/sys/vm/nr_hugepages: %s\n",
- strerror(errno));
+ ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
goto close_fd;
}
/* Start with the initial condition of 0 huge pages*/
if (write(fd, "0", sizeof(char)) != sizeof(char)) {
- ksft_test_result_fail("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n",
- strerror(errno));
+ ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
goto close_fd;
}
@@ -118,16 +119,16 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
/* Request a large number of huge pages. The Kernel will allocate
as much as it can */
if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
- ksft_test_result_fail("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n",
- strerror(errno));
+ ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
goto close_fd;
}
lseek(fd, 0, SEEK_SET);
if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
- ksft_test_result_fail("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n",
- strerror(errno));
+ ksft_print_msg("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
goto close_fd;
}
@@ -139,24 +140,26 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
!= strlen(initial_nr_hugepages)) {
- ksft_test_result_fail("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n",
- strerror(errno));
+ ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
goto close_fd;
}
+ ksft_print_msg("Number of huge pages allocated = %d\n",
+ atoi(nr_hugepages));
+
if (compaction_index > 3) {
ksft_print_msg("ERROR: Less that 1/%d of memory is available\n"
"as huge pages\n", compaction_index);
- ksft_test_result_fail("No of huge pages allocated = %d\n", (atoi(nr_hugepages)));
goto close_fd;
}
- ksft_test_result_pass("Memory compaction succeeded. No of huge pages allocated = %d\n",
- (atoi(nr_hugepages)));
ret = 0;
close_fd:
close(fd);
+ out:
+ ksft_test_result(ret == 0, "check_compaction\n");
return ret;
}
@@ -174,7 +177,7 @@ int main(int argc, char **argv)
ksft_print_header();
if (prereq() || geteuid())
- return ksft_exit_pass();
+ return ksft_exit_skip("Prerequisites unsatisfied\n");
ksft_set_plan(1);
diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c
new file mode 100644
index 000000000000..d01e8d4901d0
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case that must run on a system with one and only one huge page available.
+ * # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+ *
+ * During setup, the test allocates the only available page, and starts three threads:
+ * - thread 1:
+ * * madvise(MADV_DONTNEED) on the allocated huge page
+ * - thread 2:
+ * * Write to the allocated huge page
+ * - thread 3:
+ * * Try to allocate an extra huge page (which must not be available)
+ *
+ * The test fails if thread3 is able to allocate a page.
+ *
+ * Touching the first page after thread3's allocation will raise a SIGBUS
+ *
+ * Author: Breno Leitao <leitao@debian.org>
+ */
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#define MMAP_SIZE (1 << 21)
+#define INLOOP_ITER 100
+
+char *huge_ptr;
+
+/* Touch the memory while it is being madvised() */
+void *touch(void *unused)
+{
+ for (int i = 0; i < INLOOP_ITER; i++)
+ huge_ptr[0] = '.';
+
+ return NULL;
+}
+
+void *madv(void *unused)
+{
+ for (int i = 0; i < INLOOP_ITER; i++)
+ madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED);
+
+ return NULL;
+}
+
+/*
+ * We got here, and there must be no huge page available for mapping
+ * The other hugepage should be flipping from used <-> reserved, because
+ * of madvise(DONTNEED).
+ */
+void *map_extra(void *unused)
+{
+ void *ptr;
+
+ for (int i = 0; i < INLOOP_ITER; i++) {
+ ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+
+ if ((long)ptr != -1) {
+ /* Touching the other page now will cause a SIGBUS
+ * huge_ptr[0] = '1';
+ */
+ return ptr;
+ }
+ }
+
+ return NULL;
+}
+
+int main(void)
+{
+ pthread_t thread1, thread2, thread3;
+ unsigned long free_hugepages;
+ void *ret;
+
+ /*
+ * On kernel 6.7, we are able to reproduce the problem with ~10
+ * iterations
+ */
+ int max = 10;
+
+ free_hugepages = get_free_hugepages();
+
+ if (free_hugepages != 1) {
+ ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n",
+ free_hugepages);
+ }
+
+ while (max--) {
+ huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+
+ if ((unsigned long)huge_ptr == -1) {
+ ksft_exit_skip("Failed to allocate huge page\n");
+ return KSFT_SKIP;
+ }
+
+ pthread_create(&thread1, NULL, madv, NULL);
+ pthread_create(&thread2, NULL, touch, NULL);
+ pthread_create(&thread3, NULL, map_extra, NULL);
+
+ pthread_join(thread1, NULL);
+ pthread_join(thread2, NULL);
+ pthread_join(thread3, &ret);
+
+ if (ret) {
+ ksft_test_result_fail("Unexpected huge page allocation\n");
+ return KSFT_FAIL;
+ }
+
+ /* Unmap and restart */
+ munmap(huge_ptr, MMAP_SIZE);
+ }
+
+ return KSFT_PASS;
+}
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index fbff0dd09191..d615767e396b 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -155,12 +155,12 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot,
/* Stabilize accounting by disabling KSM completely. */
if (ksm_unmerge()) {
ksft_test_result_fail("Disabling (unmerging) KSM failed\n");
- goto unmap;
+ return MAP_FAILED;
}
if (get_my_merging_pages() > 0) {
ksft_test_result_fail("Still pages merged\n");
- goto unmap;
+ return MAP_FAILED;
}
map = mmap(NULL, size, PROT_READ|PROT_WRITE,
diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c
index 598159f3df1f..b74813fdc951 100644
--- a/tools/testing/selftests/mm/map_fixed_noreplace.c
+++ b/tools/testing/selftests/mm/map_fixed_noreplace.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include "../kselftest.h"
static void dump_maps(void)
{
@@ -28,15 +29,12 @@ static unsigned long find_base_addr(unsigned long size)
flags = MAP_PRIVATE | MAP_ANONYMOUS;
addr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
- if (addr == MAP_FAILED) {
- printf("Error: couldn't map the space we need for the test\n");
- return 0;
- }
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n");
+
+ if (munmap(addr, size) != 0)
+ ksft_exit_fail_msg("Error: munmap failed\n");
- if (munmap(addr, size) != 0) {
- printf("Error: couldn't map the space we need for the test\n");
- return 0;
- }
return (unsigned long)addr;
}
@@ -46,51 +44,39 @@ int main(void)
unsigned long flags, addr, size, page_size;
char *p;
+ ksft_print_header();
+ ksft_set_plan(9);
+
page_size = sysconf(_SC_PAGE_SIZE);
- //let's find a base addr that is free before we start the tests
+ /* let's find a base addr that is free before we start the tests */
size = 5 * page_size;
base_addr = find_base_addr(size);
- if (!base_addr) {
- printf("Error: couldn't map the space we need for the test\n");
- return 1;
- }
flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
- // Check we can map all the areas we need below
- errno = 0;
+ /* Check we can map all the areas we need below */
addr = base_addr;
size = 5 * page_size;
p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-
- printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
if (p == MAP_FAILED) {
dump_maps();
- printf("Error: couldn't map the space we need for the test\n");
- return 1;
+ ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n");
}
-
- errno = 0;
if (munmap((void *)addr, 5 * page_size) != 0) {
dump_maps();
- printf("Error: munmap failed!?\n");
- return 1;
+ ksft_exit_fail_msg("Error: munmap failed!?\n");
}
- printf("unmap() successful\n");
+ ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
- errno = 0;
addr = base_addr + page_size;
size = 3 * page_size;
p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
- printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
if (p == MAP_FAILED) {
dump_maps();
- printf("Error: first mmap() failed unexpectedly\n");
- return 1;
+ ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n");
}
+ ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
/*
* Exact same mapping again:
@@ -100,17 +86,14 @@ int main(void)
* +3 | mapped | new
* +4 | free | new
*/
- errno = 0;
addr = base_addr;
size = 5 * page_size;
p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
- printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
if (p != MAP_FAILED) {
dump_maps();
- printf("Error:1: mmap() succeeded when it shouldn't have\n");
- return 1;
+ ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n");
}
+ ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
/*
* Second mapping contained within first:
@@ -121,17 +104,14 @@ int main(void)
* +3 | mapped |
* +4 | free |
*/
- errno = 0;
addr = base_addr + (2 * page_size);
size = page_size;
p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
- printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
if (p != MAP_FAILED) {
dump_maps();
- printf("Error:2: mmap() succeeded when it shouldn't have\n");
- return 1;
+ ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n");
}
+ ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
/*
* Overlap end of existing mapping:
@@ -141,17 +121,14 @@ int main(void)
* +3 | mapped | new
* +4 | free | new
*/
- errno = 0;
addr = base_addr + (3 * page_size);
size = 2 * page_size;
p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
- printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
if (p != MAP_FAILED) {
dump_maps();
- printf("Error:3: mmap() succeeded when it shouldn't have\n");
- return 1;
+ ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n");
}
+ ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
/*
* Overlap start of existing mapping:
@@ -161,17 +138,14 @@ int main(void)
* +3 | mapped |
* +4 | free |
*/
- errno = 0;
addr = base_addr;
size = 2 * page_size;
p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
- printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
if (p != MAP_FAILED) {
dump_maps();
- printf("Error:4: mmap() succeeded when it shouldn't have\n");
- return 1;
+ ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n");
}
+ ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
/*
* Adjacent to start of existing mapping:
@@ -181,17 +155,14 @@ int main(void)
* +3 | mapped |
* +4 | free |
*/
- errno = 0;
addr = base_addr;
size = page_size;
p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
- printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
if (p == MAP_FAILED) {
dump_maps();
- printf("Error:5: mmap() failed when it shouldn't have\n");
- return 1;
+ ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n");
}
+ ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
/*
* Adjacent to end of existing mapping:
@@ -201,27 +172,22 @@ int main(void)
* +3 | mapped |
* +4 | free | new
*/
- errno = 0;
addr = base_addr + (4 * page_size);
size = page_size;
p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
- printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
if (p == MAP_FAILED) {
dump_maps();
- printf("Error:6: mmap() failed when it shouldn't have\n");
- return 1;
+ ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n");
}
+ ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
addr = base_addr;
size = 5 * page_size;
if (munmap((void *)addr, size) != 0) {
dump_maps();
- printf("Error: munmap failed!?\n");
- return 1;
+ ksft_exit_fail_msg("Error: munmap failed!?\n");
}
- printf("unmap() successful\n");
+ ksft_test_result_pass("Base Address unmap() successful\n");
- printf("OK\n");
- return 0;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c
index 86e8f2048a40..a1f005a90a4f 100644
--- a/tools/testing/selftests/mm/map_hugetlb.c
+++ b/tools/testing/selftests/mm/map_hugetlb.c
@@ -16,6 +16,7 @@
#include <sys/mman.h>
#include <fcntl.h>
#include "vm_util.h"
+#include "../kselftest.h"
#define LENGTH (256UL*1024*1024)
#define PROTECTION (PROT_READ | PROT_WRITE)
@@ -31,7 +32,7 @@
static void check_bytes(char *addr)
{
- printf("First hex is %x\n", *((unsigned int *)addr));
+ ksft_print_msg("First hex is %x\n", *((unsigned int *)addr));
}
static void write_bytes(char *addr, size_t length)
@@ -42,23 +43,21 @@ static void write_bytes(char *addr, size_t length)
*(addr + i) = (char)i;
}
-static int read_bytes(char *addr, size_t length)
+static void read_bytes(char *addr, size_t length)
{
unsigned long i;
check_bytes(addr);
for (i = 0; i < length; i++)
- if (*(addr + i) != (char)i) {
- printf("Mismatch at %lu\n", i);
- return 1;
- }
- return 0;
+ if (*(addr + i) != (char)i)
+ ksft_exit_fail_msg("Mismatch at %lu\n", i);
+
+ ksft_test_result_pass("Read correct data\n");
}
int main(int argc, char **argv)
{
void *addr;
- int ret;
size_t hugepage_size;
size_t length = LENGTH;
int flags = FLAGS;
@@ -69,6 +68,9 @@ int main(int argc, char **argv)
if (hugepage_size > length)
length = hugepage_size;
+ ksft_print_header();
+ ksft_set_plan(1);
+
if (argc > 1)
length = atol(argv[1]) << 20;
if (argc > 2) {
@@ -78,27 +80,23 @@ int main(int argc, char **argv)
}
if (shift)
- printf("%u kB hugepages\n", 1 << (shift - 10));
+ ksft_print_msg("%u kB hugepages\n", 1 << (shift - 10));
else
- printf("Default size hugepages\n");
- printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
+ ksft_print_msg("Default size hugepages\n");
+ ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
- if (addr == MAP_FAILED) {
- perror("mmap");
- exit(1);
- }
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
- printf("Returned address is %p\n", addr);
+ ksft_print_msg("Returned address is %p\n", addr);
check_bytes(addr);
write_bytes(addr, length);
- ret = read_bytes(addr, length);
+ read_bytes(addr, length);
/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
- if (munmap(addr, length)) {
- perror("munmap");
- exit(1);
- }
+ if (munmap(addr, length))
+ ksft_exit_fail_msg("munmap: %s\n", strerror(errno));
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c
index 7945d0754875..5c8a53869b1b 100644
--- a/tools/testing/selftests/mm/map_populate.c
+++ b/tools/testing/selftests/mm/map_populate.c
@@ -16,19 +16,21 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include "../kselftest.h"
#define MMAP_SZ 4096
-#define BUG_ON(condition, description) \
- do { \
- if (condition) { \
- fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
- __LINE__, (description), strerror(errno)); \
- exit(1); \
- } \
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) \
+ ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \
+ __func__, __LINE__, (description), \
+ strerror(errno)); \
} while (0)
-static int parent_f(int sock, unsigned long *smap, int child)
+#define TESTS_IN_CHILD 2
+
+static void parent_f(int sock, unsigned long *smap, int child)
{
int status, ret;
@@ -43,9 +45,10 @@ static int parent_f(int sock, unsigned long *smap, int child)
BUG_ON(ret <= 0, "write(sock)");
waitpid(child, &status, 0);
- BUG_ON(!WIFEXITED(status), "child in unexpected state");
- return WEXITSTATUS(status);
+ /* The ksft macros don't keep counters between processes */
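+ /* The child's exit status carries the number of its checks that passed. */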
+ ksft_cnt.ksft_pass = WEXITSTATUS(status);
+ ksft_cnt.ksft_fail = TESTS_IN_CHILD - WEXITSTATUS(status);
}
static int child_f(int sock, unsigned long *smap, int fd)
@@ -64,10 +67,11 @@ static int child_f(int sock, unsigned long *smap, int fd)
ret = read(sock, &buf, sizeof(int));
BUG_ON(ret <= 0, "read(sock)");
- BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
- BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+ ksft_test_result(*smap != 0x22222BAD, "MAP_POPULATE COW private page\n");
+ ksft_test_result(*smap == 0xdeadbabe, "The mapping state\n");
- return 0;
+ /* The ksft macros don't keep counters between processes */
+ return ksft_cnt.ksft_pass;
}
int main(int argc, char **argv)
@@ -76,6 +80,9 @@ int main(int argc, char **argv)
FILE *ftmp;
unsigned long *smap;
+ ksft_print_header();
+ ksft_set_plan(TESTS_IN_CHILD);
+
ftmp = tmpfile();
BUG_ON(!ftmp, "tmpfile()");
@@ -101,7 +108,9 @@ int main(int argc, char **argv)
ret = close(sock[0]);
BUG_ON(ret, "close()");
- return parent_f(sock[1], smap, child);
+ parent_f(sock[1], smap, child);
+
+ ksft_finished();
}
ret = close(sock[1]);
diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c
index 1fba77df7f62..1cd80b0f76c3 100644
--- a/tools/testing/selftests/mm/mlock-random-test.c
+++ b/tools/testing/selftests/mm/mlock-random-test.c
@@ -13,6 +13,7 @@
#include <sys/ipc.h>
#include <sys/shm.h>
#include <time.h>
+#include "../kselftest.h"
#include "mlock2.h"
#define CHUNK_UNIT (128 * 1024)
@@ -31,14 +32,14 @@ int set_cap_limits(rlim_t max)
new.rlim_cur = max;
new.rlim_max = max;
if (setrlimit(RLIMIT_MEMLOCK, &new)) {
- perror("setrlimit() returns error\n");
+ ksft_perror("setrlimit() returns error\n");
return -1;
}
/* drop capabilities including CAP_IPC_LOCK */
if (cap_set_proc(cap)) {
- perror("cap_set_proc() returns error\n");
- return -2;
+ ksft_perror("cap_set_proc() returns error\n");
+ return -1;
}
return 0;
@@ -52,27 +53,24 @@ int get_proc_locked_vm_size(void)
unsigned long lock_size = 0;
f = fopen("/proc/self/status", "r");
- if (!f) {
- perror("fopen");
- return -1;
- }
+ if (!f)
+ ksft_exit_fail_msg("fopen: %s\n", strerror(errno));
while (fgets(line, 1024, f)) {
if (strstr(line, "VmLck")) {
ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
if (ret <= 0) {
- printf("sscanf() on VmLck error: %s: %d\n",
- line, ret);
fclose(f);
- return -1;
+ ksft_exit_fail_msg("sscanf() on VmLck error: %s: %d\n",
+ line, ret);
}
fclose(f);
return (int)(lock_size << 10);
}
}
- perror("cannot parse VmLck in /proc/self/status\n");
fclose(f);
+ ksft_exit_fail_msg("cannot parse VmLck in /proc/self/status: %s\n", strerror(errno));
return -1;
}
@@ -91,10 +89,8 @@ int get_proc_page_size(unsigned long addr)
size_t size;
smaps = seek_to_smaps_entry(addr);
- if (!smaps) {
- printf("Unable to parse /proc/self/smaps\n");
- return 0;
- }
+ if (!smaps)
+ ksft_exit_fail_msg("Unable to parse /proc/self/smaps\n");
while (getline(&line, &size, smaps) > 0) {
if (!strstr(line, "MMUPageSize")) {
@@ -105,12 +101,9 @@ int get_proc_page_size(unsigned long addr)
}
/* found the MMUPageSize of this section */
- if (sscanf(line, "MMUPageSize: %8lu kB",
- &mmupage_size) < 1) {
- printf("Unable to parse smaps entry for Size:%s\n",
- line);
- break;
- }
+ if (sscanf(line, "MMUPageSize: %8lu kB", &mmupage_size) < 1)
+ ksft_exit_fail_msg("Unable to parse smaps entry for Size:%s\n",
+ line);
}
free(line);
@@ -136,7 +129,7 @@ int get_proc_page_size(unsigned long addr)
* return value: 0 - success
* else: failure
*/
-int test_mlock_within_limit(char *p, int alloc_size)
+static void test_mlock_within_limit(char *p, int alloc_size)
{
int i;
int ret = 0;
@@ -145,11 +138,9 @@ int test_mlock_within_limit(char *p, int alloc_size)
int page_size = 0;
getrlimit(RLIMIT_MEMLOCK, &cur);
- if (cur.rlim_cur < alloc_size) {
- printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
- alloc_size, (unsigned int)cur.rlim_cur);
- return -1;
- }
+ if (cur.rlim_cur < alloc_size)
+ ksft_exit_fail_msg("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+ alloc_size, (unsigned int)cur.rlim_cur);
srand(time(NULL));
for (i = 0; i < TEST_LOOP; i++) {
@@ -169,13 +160,11 @@ int test_mlock_within_limit(char *p, int alloc_size)
ret = mlock2_(p + start_offset, lock_size,
MLOCK_ONFAULT);
- if (ret) {
- printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
- is_mlock ? "mlock" : "mlock2",
- p, alloc_size,
- p + start_offset, lock_size);
- return ret;
- }
+ if (ret)
+ ksft_exit_fail_msg("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+ is_mlock ? "mlock" : "mlock2",
+ p, alloc_size,
+ p + start_offset, lock_size);
}
/*
@@ -183,18 +172,12 @@ int test_mlock_within_limit(char *p, int alloc_size)
*/
locked_vm_size = get_proc_locked_vm_size();
page_size = get_proc_page_size((unsigned long)p);
- if (page_size == 0) {
- printf("cannot get proc MMUPageSize\n");
- return -1;
- }
- if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
- printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
- locked_vm_size, alloc_size);
- return -1;
- }
+ if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size)
+ ksft_exit_fail_msg("%s left VmLck:%d on %d chunk\n",
+ __func__, locked_vm_size, alloc_size);
- return 0;
+ ksft_test_result_pass("%s\n", __func__);
}
@@ -213,7 +196,7 @@ int test_mlock_within_limit(char *p, int alloc_size)
* return value: 0 - success
* else: failure
*/
-int test_mlock_outof_limit(char *p, int alloc_size)
+static void test_mlock_outof_limit(char *p, int alloc_size)
{
int i;
int ret = 0;
@@ -221,11 +204,9 @@ int test_mlock_outof_limit(char *p, int alloc_size)
struct rlimit cur;
getrlimit(RLIMIT_MEMLOCK, &cur);
- if (cur.rlim_cur >= alloc_size) {
- printf("alloc_size[%d] >%u rlimit, violates test condition\n",
- alloc_size, (unsigned int)cur.rlim_cur);
- return -1;
- }
+ if (cur.rlim_cur >= alloc_size)
+ ksft_exit_fail_msg("alloc_size[%d] >%u rlimit, violates test condition\n",
+ alloc_size, (unsigned int)cur.rlim_cur);
old_locked_vm_size = get_proc_locked_vm_size();
srand(time(NULL));
@@ -240,56 +221,47 @@ int test_mlock_outof_limit(char *p, int alloc_size)
else
ret = mlock2_(p + start_offset, lock_size,
MLOCK_ONFAULT);
- if (ret == 0) {
- printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
- is_mlock ? "mlock" : "mlock2",
- p, alloc_size,
- p + start_offset, lock_size);
- return -1;
- }
+ if (ret == 0)
+ ksft_exit_fail_msg("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+ is_mlock ? "mlock" : "mlock2",
+ p, alloc_size, p + start_offset, lock_size);
}
locked_vm_size = get_proc_locked_vm_size();
- if (locked_vm_size != old_locked_vm_size) {
- printf("tests leads to new mlocked page: old[%d], new[%d]\n",
- old_locked_vm_size,
- locked_vm_size);
- return -1;
- }
+ if (locked_vm_size != old_locked_vm_size)
+ ksft_exit_fail_msg("tests leads to new mlocked page: old[%d], new[%d]\n",
+ old_locked_vm_size,
+ locked_vm_size);
- return 0;
+ ksft_test_result_pass("%s\n", __func__);
}
int main(int argc, char **argv)
{
char *p = NULL;
- int ret = 0;
+
+ ksft_print_header();
if (set_cap_limits(MLOCK_RLIMIT_SIZE))
- return -1;
+ ksft_finished();
+
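+ /* Two results: one mlock run within RLIMIT_MEMLOCK, one beyond it. */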
+ ksft_set_plan(2);
p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
- if (p == NULL) {
- perror("malloc() failure\n");
- return -1;
- }
- ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
- if (ret)
- return ret;
+ if (p == NULL)
+ ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno));
+
+ test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
free(p);
-
p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
- if (p == NULL) {
- perror("malloc() failure\n");
- return -1;
- }
- ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
- if (ret)
- return ret;
+ if (p == NULL)
+ ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno));
+
+ test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
free(p);
- return 0;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
index 80cddc0de206..26f744188ad0 100644
--- a/tools/testing/selftests/mm/mlock2-tests.c
+++ b/tools/testing/selftests/mm/mlock2-tests.c
@@ -7,9 +7,8 @@
#include <sys/time.h>
#include <sys/resource.h>
#include <stdbool.h>
-#include "mlock2.h"
-
#include "../kselftest.h"
+#include "mlock2.h"
struct vm_boundaries {
unsigned long start;
@@ -40,14 +39,14 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
while(fgets(line, 1024, file)) {
end_addr = strchr(line, '-');
if (!end_addr) {
- printf("cannot parse /proc/self/maps\n");
+ ksft_print_msg("cannot parse /proc/self/maps\n");
goto out;
}
*end_addr = '\0';
end_addr++;
stop = strchr(end_addr, ' ');
if (!stop) {
- printf("cannot parse /proc/self/maps\n");
+ ksft_print_msg("cannot parse /proc/self/maps\n");
goto out;
}
@@ -78,7 +77,7 @@ static bool is_vmflag_set(unsigned long addr, const char *vmflag)
smaps = seek_to_smaps_entry(addr);
if (!smaps) {
- printf("Unable to parse /proc/self/smaps\n");
+ ksft_print_msg("Unable to parse /proc/self/smaps\n");
goto out;
}
@@ -115,7 +114,7 @@ static unsigned long get_value_for_name(unsigned long addr, const char *name)
smaps = seek_to_smaps_entry(addr);
if (!smaps) {
- printf("Unable to parse /proc/self/smaps\n");
+ ksft_print_msg("Unable to parse /proc/self/smaps\n");
goto out;
}
@@ -129,7 +128,7 @@ static unsigned long get_value_for_name(unsigned long addr, const char *name)
value_ptr = line + strlen(name);
if (sscanf(value_ptr, "%lu kB", &value) < 1) {
- printf("Unable to parse smaps entry for Size\n");
+ ksft_print_msg("Unable to parse smaps entry for Size\n");
goto out;
}
break;
@@ -180,57 +179,45 @@ static int lock_check(unsigned long addr)
static int unlock_lock_check(char *map)
{
if (is_vmflag_set((unsigned long)map, LOCKED)) {
- printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+ ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED);
return 1;
}
return 0;
}
-static int test_mlock_lock()
+static void test_mlock_lock(void)
{
char *map;
- int ret = 1;
unsigned long page_size = getpagesize();
map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- if (map == MAP_FAILED) {
- perror("test_mlock_locked mmap");
- goto out;
- }
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
if (mlock2_(map, 2 * page_size, 0)) {
- if (errno == ENOSYS) {
- printf("Cannot call new mlock family, skipping test\n");
- _exit(KSFT_SKIP);
- }
- perror("mlock2(0)");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlock2(0): %s\n", strerror(errno));
}
- if (!lock_check((unsigned long)map))
- goto unmap;
+ ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__);
/* Now unlock and recheck attributes */
if (munlock(map, 2 * page_size)) {
- perror("munlock()");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
}
- ret = unlock_lock_check(map);
-
-unmap:
+ ksft_test_result(!unlock_lock_check(map), "%s: Unlocked\n", __func__);
munmap(map, 2 * page_size);
-out:
- return ret;
}
static int onfault_check(char *map)
{
*map = 'a';
if (!is_vma_lock_on_fault((unsigned long)map)) {
- printf("VMA is not marked for lock on fault\n");
+ ksft_print_msg("VMA is not marked for lock on fault\n");
return 1;
}
@@ -243,172 +230,131 @@ static int unlock_onfault_check(char *map)
if (is_vma_lock_on_fault((unsigned long)map) ||
is_vma_lock_on_fault((unsigned long)map + page_size)) {
- printf("VMA is still lock on fault after unlock\n");
+ ksft_print_msg("VMA is still lock on fault after unlock\n");
return 1;
}
return 0;
}
-static int test_mlock_onfault()
+static void test_mlock_onfault(void)
{
char *map;
- int ret = 1;
unsigned long page_size = getpagesize();
map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- if (map == MAP_FAILED) {
- perror("test_mlock_locked mmap");
- goto out;
- }
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
- if (errno == ENOSYS) {
- printf("Cannot call new mlock family, skipping test\n");
- _exit(KSFT_SKIP);
- }
- perror("mlock2(MLOCK_ONFAULT)");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlock2(MLOCK_ONFAULT): %s\n", strerror(errno));
}
- if (onfault_check(map))
- goto unmap;
+ ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__);
/* Now unlock and recheck attributes */
if (munlock(map, 2 * page_size)) {
- if (errno == ENOSYS) {
- printf("Cannot call new mlock family, skipping test\n");
- _exit(KSFT_SKIP);
- }
- perror("munlock()");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
}
- ret = unlock_onfault_check(map);
-unmap:
+ ksft_test_result(!unlock_onfault_check(map), "VMA not lock on fault after unlock\n");
munmap(map, 2 * page_size);
-out:
- return ret;
}
-static int test_lock_onfault_of_present()
+static void test_lock_onfault_of_present(void)
{
char *map;
- int ret = 1;
unsigned long page_size = getpagesize();
map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- if (map == MAP_FAILED) {
- perror("test_mlock_locked mmap");
- goto out;
- }
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
*map = 'a';
if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
- if (errno == ENOSYS) {
- printf("Cannot call new mlock family, skipping test\n");
- _exit(KSFT_SKIP);
- }
- perror("mlock2(MLOCK_ONFAULT)");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_test_result_fail("mlock2(MLOCK_ONFAULT) error: %s\n", strerror(errno));
+ return;
}
- if (!is_vma_lock_on_fault((unsigned long)map) ||
- !is_vma_lock_on_fault((unsigned long)map + page_size)) {
- printf("VMA with present pages is not marked lock on fault\n");
- goto unmap;
- }
- ret = 0;
-unmap:
+ ksft_test_result(is_vma_lock_on_fault((unsigned long)map) &&
+ is_vma_lock_on_fault((unsigned long)map + page_size),
+ "VMA with present pages is marked lock on fault\n");
munmap(map, 2 * page_size);
-out:
- return ret;
}
-static int test_munlockall()
+static void test_munlockall0(void)
{
char *map;
- int ret = 1;
unsigned long page_size = getpagesize();
map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
- if (map == MAP_FAILED) {
- perror("test_munlockall mmap");
- goto out;
- }
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s\n", strerror(errno));
if (mlockall(MCL_CURRENT)) {
- perror("mlockall(MCL_CURRENT)");
- goto out;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlockall(MCL_CURRENT): %s\n", strerror(errno));
}
- if (!lock_check((unsigned long)map))
- goto unmap;
+ ksft_test_result(lock_check((unsigned long)map), "%s: Locked memory area\n", __func__);
if (munlockall()) {
- perror("munlockall()");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno));
}
- if (unlock_lock_check(map))
- goto unmap;
-
+ ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__);
munmap(map, 2 * page_size);
+}
+
+static void test_munlockall1(void)
+{
+ char *map;
+ unsigned long page_size = getpagesize();
map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
- if (map == MAP_FAILED) {
- perror("test_munlockall second mmap");
- goto out;
- }
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
- perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_ONFAULT): %s\n", strerror(errno));
}
- if (onfault_check(map))
- goto unmap;
+ ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__);
if (munlockall()) {
- perror("munlockall()");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno));
}
- if (unlock_onfault_check(map))
- goto unmap;
+ ksft_test_result(!unlock_onfault_check(map), "%s: Unlocked\n", __func__);
if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
- perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
- goto out;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno));
}
- if (!lock_check((unsigned long)map))
- goto unmap;
+ ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__);
if (munlockall()) {
- perror("munlockall()");
- goto unmap;
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlockall() %s\n", strerror(errno));
}
- ret = unlock_lock_check(map);
-
-unmap:
+ ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__);
munmap(map, 2 * page_size);
-out:
- munlockall();
- return ret;
}
-static int test_vma_management(bool call_mlock)
+static void test_vma_management(bool call_mlock)
{
- int ret = 1;
void *map;
unsigned long page_size = getpagesize();
struct vm_boundaries page1;
@@ -417,25 +363,19 @@ static int test_vma_management(bool call_mlock)
map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- if (map == MAP_FAILED) {
- perror("mmap()");
- return ret;
- }
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
- if (errno == ENOSYS) {
- printf("Cannot call new mlock family, skipping test\n");
- _exit(KSFT_SKIP);
- }
- perror("mlock(ONFAULT)\n");
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("mlock error: %s\n", strerror(errno));
+ return;
}
if (get_vm_area((unsigned long)map, &page1) ||
get_vm_area((unsigned long)map + page_size, &page2) ||
get_vm_area((unsigned long)map + page_size * 2, &page3)) {
- printf("couldn't find mapping in /proc/self/maps\n");
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("couldn't find mapping in /proc/self/maps\n");
+ return;
}
/*
@@ -444,76 +384,86 @@ static int test_vma_management(bool call_mlock)
* not a failure)
*/
if (page1.start != page2.start || page2.start != page3.start) {
- printf("VMAs are not merged to start, aborting test\n");
- ret = 0;
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_skip("VMAs are not merged to start, aborting test\n");
+ return;
}
if (munlock(map + page_size, page_size)) {
- perror("munlock()");
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("munlock(): %s\n", strerror(errno));
+ return;
}
if (get_vm_area((unsigned long)map, &page1) ||
get_vm_area((unsigned long)map + page_size, &page2) ||
get_vm_area((unsigned long)map + page_size * 2, &page3)) {
- printf("couldn't find mapping in /proc/self/maps\n");
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("couldn't find mapping in /proc/self/maps\n");
+ return;
}
/* All three VMAs should be different */
if (page1.start == page2.start || page2.start == page3.start) {
- printf("failed to split VMA for munlock\n");
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("failed to split VMA for munlock\n");
+ return;
}
/* Now unlock the first and third page and check the VMAs again */
if (munlock(map, page_size * 3)) {
- perror("munlock()");
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("munlock(): %s\n", strerror(errno));
+ return;
}
if (get_vm_area((unsigned long)map, &page1) ||
get_vm_area((unsigned long)map + page_size, &page2) ||
get_vm_area((unsigned long)map + page_size * 2, &page3)) {
- printf("couldn't find mapping in /proc/self/maps\n");
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("couldn't find mapping in /proc/self/maps\n");
+ return;
}
/* Now all three VMAs should be the same */
if (page1.start != page2.start || page2.start != page3.start) {
- printf("failed to merge VMAs after munlock\n");
- goto out;
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("failed to merge VMAs after munlock\n");
+ return;
}
- ret = 0;
-out:
+ ksft_test_result_pass("%s call_mlock %d\n", __func__, call_mlock);
munmap(map, 3 * page_size);
- return ret;
}
-static int test_mlockall(int (test_function)(bool call_mlock))
+static void test_mlockall(void)
{
- int ret = 1;
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE))
+ ksft_exit_fail_msg("mlockall failed: %s\n", strerror(errno));
- if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
- perror("mlockall");
- return ret;
- }
-
- ret = test_function(false);
+ test_vma_management(false);
munlockall();
- return ret;
}
int main(int argc, char **argv)
{
- int ret = 0;
- ret += test_mlock_lock();
- ret += test_mlock_onfault();
- ret += test_munlockall();
- ret += test_lock_onfault_of_present();
- ret += test_vma_management(true);
- ret += test_mlockall(test_vma_management);
- return ret;
+ int ret, size = 3 * getpagesize();
+ void *map;
+
+ ksft_print_header();
+
+ map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
+
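+ /* Probe for mlock2(); if the syscall is missing, skip the whole suite. */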
+ ret = mlock2_(map, size, MLOCK_ONFAULT);
+ if (ret && errno == ENOSYS)
+ ksft_finished();
+
+ munmap(map, size);
+
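+ /* 13 = 2 + 2 + 2 (lock, onfault, munlockall0) + 4 (munlockall1) + 1 (onfault_of_present) + 2 (vma_management runs). */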
+ ksft_set_plan(13);
+
+ test_mlock_lock();
+ test_mlock_onfault();
+ test_munlockall0();
+ test_munlockall1();
+ test_lock_onfault_of_present();
+ test_vma_management(true);
+ test_mlockall();
+
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h
index 8e02991b313c..4417eaa5cfb7 100644
--- a/tools/testing/selftests/mm/mlock2.h
+++ b/tools/testing/selftests/mm/mlock2.h
@@ -6,12 +6,7 @@
static int mlock2_(void *start, size_t len, int flags)
{
-#ifdef __NR_mlock2
return syscall(__NR_mlock2, start, len, flags);
-#else
- errno = ENOSYS;
- return -1;
-#endif
}
static FILE *seek_to_smaps_entry(unsigned long addr)
@@ -27,10 +22,8 @@ static FILE *seek_to_smaps_entry(unsigned long addr)
char path[BUFSIZ];
file = fopen("/proc/self/smaps", "r");
- if (!file) {
- perror("fopen smaps");
- _exit(1);
- }
+ if (!file)
+ ksft_exit_fail_msg("fopen smaps: %s\n", strerror(errno));
while (getline(&line, &size, file) > 0) {
if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c
index d822004a374e..100370a7111d 100644
--- a/tools/testing/selftests/mm/mrelease_test.c
+++ b/tools/testing/selftests/mm/mrelease_test.c
@@ -26,19 +26,15 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd)
buf = (char *)mmap(NULL, nr_pages * psize(), PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, 0, 0);
- if (buf == MAP_FAILED) {
- perror("mmap failed, halting the test");
- return KSFT_FAIL;
- }
+ if (buf == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed, halting the test: %s\n", strerror(errno));
for (i = 0; i < nr_pages; i++)
*((unsigned long *)(buf + (i * psize()))) = i;
/* Signal the parent that the child is ready */
- if (write(pipefd, "", 1) < 0) {
- perror("write");
- return KSFT_FAIL;
- }
+ if (write(pipefd, "", 1) < 0)
+ ksft_exit_fail_msg("write: %s\n", strerror(errno));
/* Wait to be killed (when reparenting happens) */
while (getppid() == ppid && timeout > 0) {
@@ -54,23 +50,17 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd)
/* The process_mrelease calls in this test are expected to fail */
static void run_negative_tests(int pidfd)
{
- int res;
/* Test invalid flags. Expect to fail with EINVAL error code. */
if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) ||
errno != EINVAL) {
- res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
- perror("process_mrelease with wrong flags");
- exit(res);
+ ksft_exit_fail_msg("process_mrelease with wrong flags: %s\n", strerror(errno));
}
/*
* Test reaping while process is alive with no pending SIGKILL.
* Expect to fail with EINVAL error code.
*/
- if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) {
- res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
- perror("process_mrelease on a live process");
- exit(res);
- }
+ if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL)
+ ksft_exit_fail_msg("process_mrelease on a live process: %s\n", strerror(errno));
}
static int child_main(int pipefd[], size_t size)
@@ -93,11 +83,18 @@ int main(void)
char byte;
int res;
+ ksft_print_header();
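+ /* Only the final successful reap is reported; the negative checks exit on failure. */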
+ ksft_set_plan(1);
+
/* Test a wrong pidfd */
if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) {
- res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
- perror("process_mrelease with wrong pidfd");
- exit(res);
+ if (errno == ENOSYS) {
+ ksft_test_result_skip("process_mrelease not implemented\n");
+ ksft_finished();
+ } else {
+ ksft_exit_fail_msg("process_mrelease with wrong pidfd: %s",
+ strerror(errno));
+ }
}
/* Start the test with 1MB child memory allocation */
@@ -107,16 +104,14 @@ retry:
* Pipe for the child to signal when it's done allocating
* memory
*/
- if (pipe(pipefd)) {
- perror("pipe");
- exit(KSFT_FAIL);
- }
+ if (pipe(pipefd))
+ ksft_exit_fail_msg("pipe: %s\n", strerror(errno));
+
pid = fork();
if (pid < 0) {
- perror("fork");
close(pipefd[0]);
close(pipefd[1]);
- exit(KSFT_FAIL);
+ ksft_exit_fail_msg("fork: %s\n", strerror(errno));
}
if (pid == 0) {
@@ -134,28 +129,23 @@ retry:
res = read(pipefd[0], &byte, 1);
close(pipefd[0]);
if (res < 0) {
- perror("read");
if (!kill(pid, SIGKILL))
waitpid(pid, NULL, 0);
- exit(KSFT_FAIL);
+ ksft_exit_fail_msg("read: %s\n", strerror(errno));
}
pidfd = syscall(__NR_pidfd_open, pid, 0);
if (pidfd < 0) {
- perror("pidfd_open");
if (!kill(pid, SIGKILL))
waitpid(pid, NULL, 0);
- exit(KSFT_FAIL);
+ ksft_exit_fail_msg("pidfd_open: %s\n", strerror(errno));
}
/* Run negative tests which require a live child */
run_negative_tests(pidfd);
- if (kill(pid, SIGKILL)) {
- res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
- perror("kill");
- exit(res);
- }
+ if (kill(pid, SIGKILL))
+ ksft_exit_fail_msg("kill: %s\n", strerror(errno));
success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
if (!success) {
@@ -169,18 +159,15 @@ retry:
if (errno == ESRCH) {
retry = (size <= MAX_SIZE_MB);
} else {
- res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
- perror("process_mrelease");
waitpid(pid, NULL, 0);
- exit(res);
+ ksft_exit_fail_msg("process_mrelease: %s\n", strerror(errno));
}
}
/* Cleanup to prevent zombies */
- if (waitpid(pid, NULL, 0) < 0) {
- perror("waitpid");
- exit(KSFT_FAIL);
- }
+ if (waitpid(pid, NULL, 0) < 0)
+ ksft_exit_fail_msg("waitpid: %s\n", strerror(errno));
+
close(pidfd);
if (!success) {
@@ -188,11 +175,10 @@ retry:
size *= 2;
goto retry;
}
- printf("All process_mrelease attempts failed!\n");
- exit(KSFT_FAIL);
+ ksft_exit_fail_msg("All process_mrelease attempts failed!\n");
}
- printf("Success reaping a child with %zuMB of memory allocations\n",
- size);
- return KSFT_PASS;
+ ksft_test_result_pass("Success reaping a child with %zuMB of memory allocations\n",
+ size);
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c
index a06e73ec8568..1d75084b9ca5 100644
--- a/tools/testing/selftests/mm/mremap_dontunmap.c
+++ b/tools/testing/selftests/mm/mremap_dontunmap.c
@@ -27,14 +27,14 @@ static void dump_maps(void)
system(cmd);
}
-#define BUG_ON(condition, description) \
- do { \
- if (condition) { \
- fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
- __LINE__, (description), strerror(errno)); \
- dump_maps(); \
- exit(1); \
- } \
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) { \
+ dump_maps(); \
+ ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \
+ __func__, __LINE__, (description), \
+ strerror(errno)); \
+ } \
} while (0)
// Try a simple operation for to "test" for kernel support this prevents
@@ -122,6 +122,7 @@ static void mremap_dontunmap_simple()
"unable to unmap destination mapping");
BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
"unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
}
// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected.
@@ -173,6 +174,7 @@ static void mremap_dontunmap_simple_shmem()
"unable to unmap destination mapping");
BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
"unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
}
// This test validates MREMAP_DONTUNMAP will move page tables to a specific
@@ -219,6 +221,7 @@ static void mremap_dontunmap_simple_fixed()
"unable to unmap destination mapping");
BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
"unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
}
// This test validates that we can MREMAP_DONTUNMAP for a portion of an
@@ -269,6 +272,7 @@ static void mremap_dontunmap_partial_mapping()
"unable to unmap destination mapping");
BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
"unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
}
// This test validates that we can remap over only a portion of a mapping.
@@ -328,19 +332,24 @@ static void mremap_dontunmap_partial_mapping_overwrite(void)
"unable to unmap destination mapping");
BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
"unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
}
int main(void)
{
+ ksft_print_header();
+
page_size = sysconf(_SC_PAGE_SIZE);
// test for kernel support for MREMAP_DONTUNMAP skipping the test if
// not.
if (kernel_support_for_mremap_dontunmap() != 0) {
- printf("No kernel support for MREMAP_DONTUNMAP\n");
- return KSFT_SKIP;
+ ksft_print_msg("No kernel support for MREMAP_DONTUNMAP\n");
+ ksft_finished();
}
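+ /* Five results: one per mremap_dontunmap_* scenario below. */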
+ ksft_set_plan(5);
+
// Keep a page sized buffer around for when we need it.
page_buffer =
mmap(NULL, page_size, PROT_READ | PROT_WRITE,
@@ -356,6 +365,5 @@ int main(void)
BUG_ON(munmap(page_buffer, page_size) == -1,
"unable to unmap page buffer");
- printf("OK\n");
- return 0;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 246d53a5d7f2..de03d38907d6 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -64,6 +64,8 @@ separated by spaces:
test copy-on-write semantics
- thp
test transparent huge pages
+- hugetlb
+ test hugetlbfs huge pages
- migration
invoke move_pages(2) to exercise the migration entry code
paths in the kernel
@@ -206,6 +208,15 @@ pretty_name() {
# Usage: run_test [test binary] [arbitrary test arguments...]
run_test() {
if test_selected ${CATEGORY}; then
+ # On memory-constrained systems some tests can fail to allocate hugepages.
+ # Perform some cleanup before the test for a higher success rate.
+ if [ "${CATEGORY}" == "thp" ] || [ "${CATEGORY}" == "hugetlb" ]; then
+ echo 3 > /proc/sys/vm/drop_caches
+ sleep 2
+ echo 1 > /proc/sys/vm/compact_memory
+ sleep 2
+ fi
+
local test=$(pretty_name "$*")
local title="running $*"
local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
@@ -253,6 +264,7 @@ nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages)
# For this test, we need one and just one huge page
echo 1 > /proc/sys/vm/nr_hugepages
CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv
+CATEGORY="hugetlb" run_test ./hugetlb_madv_vs_map
# Restore the previous number of huge pages, since further tests rely on it
echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 0e74635c8c3d..7b698a848bab 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -17,6 +17,7 @@
#include <malloc.h>
#include <stdbool.h>
#include "vm_util.h"
+#include "../kselftest.h"
uint64_t pagesize;
unsigned int pageshift;
@@ -50,21 +51,19 @@ int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
return 0;
}
-static int write_file(const char *path, const char *buf, size_t buflen)
+static void write_file(const char *path, const char *buf, size_t buflen)
{
int fd;
ssize_t numwritten;
fd = open(path, O_WRONLY);
if (fd == -1)
- return 0;
+ ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno));
numwritten = write(fd, buf, buflen - 1);
close(fd);
if (numwritten < 1)
- return 0;
-
- return (unsigned int) numwritten;
+ ksft_exit_fail_msg("Write failed\n");
}
static void write_debugfs(const char *fmt, ...)
@@ -77,15 +76,10 @@ static void write_debugfs(const char *fmt, ...)
ret = vsnprintf(input, INPUT_MAX, fmt, argp);
va_end(argp);
- if (ret >= INPUT_MAX) {
- printf("%s: Debugfs input is too long\n", __func__);
- exit(EXIT_FAILURE);
- }
+ if (ret >= INPUT_MAX)
+ ksft_exit_fail_msg("%s: Debugfs input is too long\n", __func__);
- if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
- perror(SPLIT_DEBUGFS);
- exit(EXIT_FAILURE);
- }
+ write_file(SPLIT_DEBUGFS, input, ret + 1);
}
void split_pmd_thp(void)
@@ -95,39 +89,30 @@ void split_pmd_thp(void)
size_t i;
one_page = memalign(pmd_pagesize, len);
-
- if (!one_page) {
- printf("Fail to allocate memory\n");
- exit(EXIT_FAILURE);
- }
+ if (!one_page)
+ ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
madvise(one_page, len, MADV_HUGEPAGE);
for (i = 0; i < len; i++)
one_page[i] = (char)i;
- if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
- printf("No THP is allocated\n");
- exit(EXIT_FAILURE);
- }
+ if (!check_huge_anon(one_page, 4, pmd_pagesize))
+ ksft_exit_fail_msg("No THP is allocated\n");
/* split all THPs */
write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
(uint64_t)one_page + len);
for (i = 0; i < len; i++)
- if (one_page[i] != (char)i) {
- printf("%ld byte corrupted\n", i);
- exit(EXIT_FAILURE);
- }
+ if (one_page[i] != (char)i)
+ ksft_exit_fail_msg("%ld byte corrupted\n", i);
- if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
- printf("Still AnonHugePages not split\n");
- exit(EXIT_FAILURE);
- }
+ if (!check_huge_anon(one_page, 0, pmd_pagesize))
+ ksft_exit_fail_msg("Still AnonHugePages not split\n");
- printf("Split huge pages successful\n");
+ ksft_test_result_pass("Split huge pages successful\n");
free(one_page);
}
@@ -143,36 +128,29 @@ void split_pte_mapped_thp(void)
int pagemap_fd;
int kpageflags_fd;
- if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
- perror("get pagemap proc error");
- exit(EXIT_FAILURE);
- }
- pagemap_fd = open(pagemap_proc, O_RDONLY);
+ if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0)
+ ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno));
- if (pagemap_fd == -1) {
- perror("read pagemap:");
- exit(EXIT_FAILURE);
- }
+ pagemap_fd = open(pagemap_proc, O_RDONLY);
+ if (pagemap_fd == -1)
+ ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno));
kpageflags_fd = open(kpageflags_proc, O_RDONLY);
-
- if (kpageflags_fd == -1) {
- perror("read kpageflags:");
- exit(EXIT_FAILURE);
- }
+ if (kpageflags_fd == -1)
+ ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno));
one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (one_page == MAP_FAILED)
+ ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
madvise(one_page, len, MADV_HUGEPAGE);
for (i = 0; i < len; i++)
one_page[i] = (char)i;
- if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
- printf("No THP is allocated\n");
- exit(EXIT_FAILURE);
- }
+ if (!check_huge_anon(one_page, 4, pmd_pagesize))
+ ksft_exit_fail_msg("No THP is allocated\n");
/* remap the first pagesize of first THP */
pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
@@ -183,10 +161,8 @@ void split_pte_mapped_thp(void)
pagesize, pagesize,
MREMAP_MAYMOVE|MREMAP_FIXED,
pte_mapped + pagesize * i);
- if (pte_mapped2 == (char *)-1) {
- perror("mremap failed");
- exit(EXIT_FAILURE);
- }
+ if (pte_mapped2 == MAP_FAILED)
+ ksft_exit_fail_msg("mremap failed: %s\n", strerror(errno));
}
/* smap does not show THPs after mremap, use kpageflags instead */
@@ -196,10 +172,8 @@ void split_pte_mapped_thp(void)
is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
thp_size++;
- if (thp_size != 4) {
- printf("Some THPs are missing during mremap\n");
- exit(EXIT_FAILURE);
- }
+ if (thp_size != 4)
+ ksft_exit_fail_msg("Some THPs are missing during mremap\n");
/* split all remapped THPs */
write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
@@ -208,21 +182,18 @@ void split_pte_mapped_thp(void)
/* smap does not show THPs after mremap, use kpageflags instead */
thp_size = 0;
for (i = 0; i < pagesize * 4; i++) {
- if (pte_mapped[i] != (char)i) {
- printf("%ld byte corrupted\n", i);
- exit(EXIT_FAILURE);
- }
+ if (pte_mapped[i] != (char)i)
+ ksft_exit_fail_msg("%ld byte corrupted\n", i);
+
if (i % pagesize == 0 &&
is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
thp_size++;
}
- if (thp_size) {
- printf("Still %ld THPs not split\n", thp_size);
- exit(EXIT_FAILURE);
- }
+ if (thp_size)
+ ksft_exit_fail_msg("Still %ld THPs not split\n", thp_size);
- printf("Split PTE-mapped huge pages successful\n");
+ ksft_test_result_pass("Split PTE-mapped huge pages successful\n");
munmap(one_page, len);
close(pagemap_fd);
close(kpageflags_fd);
@@ -238,24 +209,21 @@ void split_file_backed_thp(void)
char testfile[INPUT_MAX];
uint64_t pgoff_start = 0, pgoff_end = 1024;
- printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n");
+ ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n");
status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
- if (status) {
- printf("Unable to create a tmpfs for testing\n");
- exit(EXIT_FAILURE);
- }
+ if (status)
+ ksft_exit_fail_msg("Unable to create a tmpfs for testing\n");
status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
if (status >= INPUT_MAX) {
- printf("Fail to create file-backed THP split testing file\n");
- goto cleanup;
+ ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n");
}
fd = open(testfile, O_CREAT|O_WRONLY);
if (fd == -1) {
- perror("Cannot open testing file\n");
+ ksft_perror("Cannot open testing file");
goto cleanup;
}
@@ -264,7 +232,7 @@ void split_file_backed_thp(void)
close(fd);
if (num_written < 1) {
- printf("Fail to write data to testing file\n");
+ ksft_perror("Fail to write data to testing file");
goto cleanup;
}
@@ -272,42 +240,51 @@ void split_file_backed_thp(void)
write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end);
status = unlink(testfile);
- if (status)
- perror("Cannot remove testing file\n");
+ if (status) {
+ ksft_perror("Cannot remove testing file");
+ goto cleanup;
+ }
-cleanup:
status = umount(tmpfs_loc);
if (status) {
- printf("Unable to umount %s\n", tmpfs_loc);
- exit(EXIT_FAILURE);
+ rmdir(tmpfs_loc);
+ ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc);
}
+
status = rmdir(tmpfs_loc);
- if (status) {
- perror("cannot remove tmp dir");
- exit(EXIT_FAILURE);
- }
+ if (status)
+ ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno));
- printf("file-backed THP split test done, please check dmesg for more information\n");
+ ksft_print_msg("Please check dmesg for more information\n");
+ ksft_test_result_pass("File-backed THP split test done\n");
+ return;
+
+cleanup:
+ umount(tmpfs_loc);
+ rmdir(tmpfs_loc);
+ ksft_exit_fail_msg("Error occurred\n");
}
int main(int argc, char **argv)
{
+ ksft_print_header();
+
if (geteuid() != 0) {
- printf("Please run the benchmark as root\n");
- exit(EXIT_FAILURE);
+ ksft_print_msg("Please run the benchmark as root\n");
+ ksft_finished();
}
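+ /* Three results: PMD-mapped, PTE-mapped and file-backed THP splits. */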
+ ksft_set_plan(3);
+
pagesize = getpagesize();
pageshift = ffs(pagesize) - 1;
pmd_pagesize = read_pmd_pagesize();
- if (!pmd_pagesize) {
- printf("Reading PMD pagesize failed\n");
- exit(EXIT_FAILURE);
- }
+ if (!pmd_pagesize)
+ ksft_exit_fail_msg("Reading PMD pagesize failed\n");
split_pmd_thp();
split_pte_mapped_thp();
split_file_backed_thp();
- return 0;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index 622987f12c89..ea7fd8fe2876 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -4,7 +4,7 @@
Before running this huge pages for each huge page size must have been
reserved.
For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must
- be used.
+ be used. 1GB pages are not tested if they are unavailable.
Also shmmax must be increased.
And you need to run as root to work around some weird permissions in shm.
And nothing using huge pages should run in parallel.
@@ -26,8 +26,7 @@
#include <stdarg.h>
#include <string.h>
#include "vm_util.h"
-
-#define err(x) perror(x), exit(1)
+#include "../kselftest.h"
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
@@ -44,11 +43,8 @@
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
#define NUM_PAGESIZES 5
-
#define NUM_PAGES 4
-#define Dprintf(fmt...) // printf(fmt)
-
unsigned long page_sizes[NUM_PAGESIZES];
int num_page_sizes;
@@ -60,28 +56,15 @@ int ilog2(unsigned long v)
return l;
}
-void find_pagesizes(void)
-{
- glob_t g;
- int i;
- glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
- assert(g.gl_pathc <= NUM_PAGESIZES);
- for (i = 0; i < g.gl_pathc; i++) {
- sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
- &page_sizes[i]);
- page_sizes[i] <<= 10;
- printf("Found %luMB\n", page_sizes[i] >> 20);
- }
- num_page_sizes = g.gl_pathc;
- globfree(&g);
-}
-
void show(unsigned long ps)
{
char buf[100];
+
if (ps == getpagesize())
return;
- printf("%luMB: ", ps >> 20);
+
+ ksft_print_msg("%luMB: ", ps >> 20);
+
fflush(stdout);
snprintf(buf, sizeof buf,
"cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
@@ -105,7 +88,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...)
f = fopen(buf, "r");
if (!f) {
if (warn)
- printf("missing %s\n", buf);
+ ksft_print_msg("missing %s\n", buf);
return 0;
}
if (getline(&line, &linelen, f) > 0) {
@@ -119,123 +102,143 @@ unsigned long read_sysfs(int warn, char *fmt, ...)
unsigned long read_free(unsigned long ps)
{
return read_sysfs(ps != getpagesize(),
- "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
- ps >> 10);
+ "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+ ps >> 10);
}
void test_mmap(unsigned long size, unsigned flags)
{
char *map;
unsigned long before, after;
- int err;
before = read_free(size);
map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
- if (map == (char *)-1) err("mmap");
memset(map, 0xff, size*NUM_PAGES);
after = read_free(size);
- Dprintf("before %lu after %lu diff %ld size %lu\n",
- before, after, before - after, size);
- assert(size == getpagesize() || (before - after) == NUM_PAGES);
+
show(size);
- err = munmap(map, size * NUM_PAGES);
- assert(!err);
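+ /* The per-size free counter should drop by NUM_PAGES, except for the base page size. */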
+ ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
+ "%s mmap\n", __func__);
+
+ if (munmap(map, size * NUM_PAGES))
+ ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno));
}
void test_shmget(unsigned long size, unsigned flags)
{
int id;
unsigned long before, after;
- int err;
+ struct shm_info i;
+ char *map;
before = read_free(size);
id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
- if (id < 0) err("shmget");
-
- struct shm_info i;
- if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
- Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
+ if (id < 0) {
+ if (errno == EPERM) {
+ ksft_test_result_skip("shmget requires root privileges: %s\n",
+ strerror(errno));
+ return;
+ }
+ ksft_exit_fail_msg("shmget: %s\n", strerror(errno));
+ }
+ if (shmctl(id, SHM_INFO, (void *)&i) < 0)
+ ksft_exit_fail_msg("shmctl: %s\n", strerror(errno));
- Dprintf("id %d\n", id);
- char *map = shmat(id, NULL, 0600);
- if (map == (char*)-1) err("shmat");
+ map = shmat(id, NULL, 0600);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("shmat: %s\n", strerror(errno));
shmctl(id, IPC_RMID, NULL);
memset(map, 0xff, size*NUM_PAGES);
after = read_free(size);
- Dprintf("before %lu after %lu diff %ld size %lu\n",
- before, after, before - after, size);
- assert(size == getpagesize() || (before - after) == NUM_PAGES);
show(size);
- err = shmdt(map);
- assert(!err);
+ ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
+ "%s: mmap\n", __func__);
+ if (shmdt(map))
+ ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno));
}
-void sanity_checks(void)
+void find_pagesizes(void)
{
- int i;
unsigned long largest = getpagesize();
+ int i;
+ glob_t g;
- for (i = 0; i < num_page_sizes; i++) {
- if (page_sizes[i] > largest)
+ glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
+ assert(g.gl_pathc <= NUM_PAGESIZES);
+ for (i = 0; (i < g.gl_pathc) && (num_page_sizes < NUM_PAGESIZES); i++) {
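+ /* The sysfs directory name encodes the page size in kB; convert it to bytes. */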
+ sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
+ &page_sizes[num_page_sizes]);
+ page_sizes[num_page_sizes] <<= 10;
+ ksft_print_msg("Found %luMB\n", page_sizes[i] >> 20);
+
+ if (page_sizes[num_page_sizes] > largest)
- largest = page_sizes[i];
+ largest = page_sizes[num_page_sizes];
- if (read_free(page_sizes[i]) < NUM_PAGES) {
- printf("Not enough huge pages for page size %lu MB, need %u\n",
- page_sizes[i] >> 20,
- NUM_PAGES);
- exit(0);
- }
+ if (read_free(page_sizes[num_page_sizes]) >= NUM_PAGES)
+ num_page_sizes++;
+ else
+ ksft_print_msg("SKIP for size %lu MB as not enough huge pages, need %u\n",
+ page_sizes[num_page_sizes] >> 20, NUM_PAGES);
}
+ globfree(&g);
- if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
- printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
- exit(0);
- }
+ if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
+ ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax",
+ largest * NUM_PAGES);
#if defined(__x86_64__)
if (largest != 1U<<30) {
- printf("No GB pages available on x86-64\n"
- "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
- exit(0);
+ ksft_exit_fail_msg("No GB pages available on x86-64\n"
+ "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
}
#endif
}
int main(void)
{
- int i;
unsigned default_hps = default_huge_page_size();
+ int i;
+
+ ksft_print_header();
find_pagesizes();
- sanity_checks();
+ if (!num_page_sizes)
+ ksft_finished();
+
+ ksft_set_plan(2 * num_page_sizes + 3);
for (i = 0; i < num_page_sizes; i++) {
unsigned long ps = page_sizes[i];
int arg = ilog2(ps) << MAP_HUGE_SHIFT;
- printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+
+ ksft_print_msg("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
test_mmap(ps, MAP_HUGETLB | arg);
}
- printf("Testing default huge mmap\n");
+
+ ksft_print_msg("Testing default huge mmap\n");
test_mmap(default_hps, MAP_HUGETLB);
- puts("Testing non-huge shmget");
+ ksft_print_msg("Testing non-huge shmget\n");
test_shmget(getpagesize(), 0);
for (i = 0; i < num_page_sizes; i++) {
unsigned long ps = page_sizes[i];
int arg = ilog2(ps) << SHM_HUGE_SHIFT;
- printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+ ksft_print_msg("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
test_shmget(ps, SHM_HUGETLB | arg);
}
- puts("default huge shmget");
+
+ ksft_print_msg("default huge shmget\n");
test_shmget(default_hps, SHM_HUGETLB);
- return 0;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
index c61fb9350b8c..68201192e37c 100644
--- a/tools/testing/selftests/mm/transhuge-stress.c
+++ b/tools/testing/selftests/mm/transhuge-stress.c
@@ -16,6 +16,7 @@
#include <string.h>
#include <sys/mman.h>
#include "vm_util.h"
+#include "../kselftest.h"
int backing_fd = -1;
int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
@@ -34,6 +35,8 @@ int main(int argc, char **argv)
int pagemap_fd;
int duration = 0;
+ ksft_print_header();
+
ram = sysconf(_SC_PHYS_PAGES);
if (ram > SIZE_MAX / psize() / 4)
ram = SIZE_MAX / 4;
@@ -43,7 +46,8 @@ int main(int argc, char **argv)
while (++i < argc) {
if (!strcmp(argv[i], "-h"))
- errx(1, "usage: %s [-f <filename>] [-d <duration>] [size in MiB]", argv[0]);
+ ksft_exit_fail_msg("usage: %s [-f <filename>] [-d <duration>] [size in MiB]\n",
+ argv[0]);
else if (!strcmp(argv[i], "-f"))
name = argv[++i];
else if (!strcmp(argv[i], "-d"))
@@ -52,10 +56,12 @@ int main(int argc, char **argv)
len = atoll(argv[i]) << 20;
}
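+ /* A single result is reported once the requested duration has elapsed. */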
+ ksft_set_plan(1);
+
if (name) {
backing_fd = open(name, O_RDWR);
if (backing_fd == -1)
- errx(2, "open %s", name);
+ ksft_exit_fail_msg("open %s\n", name);
mmap_flags = MAP_SHARED;
}
@@ -65,21 +71,21 @@ int main(int argc, char **argv)
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
if (pagemap_fd < 0)
- err(2, "open pagemap");
+ ksft_exit_fail_msg("open pagemap\n");
len -= len % HPAGE_SIZE;
ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0);
if (ptr == MAP_FAILED)
- err(2, "initial mmap");
+ ksft_exit_fail_msg("initial mmap");
ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
if (madvise(ptr, len, MADV_HUGEPAGE))
- err(2, "MADV_HUGEPAGE");
+ ksft_exit_fail_msg("MADV_HUGEPAGE");
map_len = ram >> (HPAGE_SHIFT - 1);
map = malloc(map_len);
if (!map)
- errx(2, "map malloc");
+ ksft_exit_fail_msg("map malloc\n");
clock_gettime(CLOCK_MONOTONIC, &start);
@@ -103,7 +109,7 @@ int main(int argc, char **argv)
if (idx >= map_len) {
map = realloc(map, idx + 1);
if (!map)
- errx(2, "map realloc");
+ ksft_exit_fail_msg("map realloc\n");
memset(map + map_len, 0, idx + 1 - map_len);
map_len = idx + 1;
}
@@ -114,17 +120,19 @@ int main(int argc, char **argv)
/* split transhuge page, keep last page */
if (madvise(p, HPAGE_SIZE - psize(), MADV_DONTNEED))
- err(2, "MADV_DONTNEED");
+ ksft_exit_fail_msg("MADV_DONTNEED");
}
clock_gettime(CLOCK_MONOTONIC, &b);
s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
- warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
- "%4d succeed, %4d failed, %4d different pages",
- s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
- nr_succeed, nr_failed, nr_pages);
+ ksft_print_msg("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
+ "%4d succeed, %4d failed, %4d different pages\n",
+ s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
+ nr_succeed, nr_failed, nr_pages);
- if (duration > 0 && b.tv_sec - start.tv_sec >= duration)
- return 0;
+ if (duration > 0 && b.tv_sec - start.tv_sec >= duration) {
+ ksft_test_result_pass("Completed\n");
+ ksft_finished();
+ }
}
}
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index bae0ceaf95b1..7bcf8d48256a 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -12,6 +12,7 @@
#include <errno.h>
#include <sys/mman.h>
#include <sys/time.h>
+#include "../kselftest.h"
/*
* Maximum address range mapped with a single mmap()
@@ -68,23 +69,15 @@ static char *hind_addr(void)
return (char *) (1UL << bits);
}
-static int validate_addr(char *ptr, int high_addr)
+static void validate_addr(char *ptr, int high_addr)
{
unsigned long addr = (unsigned long) ptr;
- if (high_addr) {
- if (addr < HIGH_ADDR_MARK) {
- printf("Bad address %lx\n", addr);
- return 1;
- }
- return 0;
- }
+ if (high_addr && addr < HIGH_ADDR_MARK)
+ ksft_exit_fail_msg("Bad address %lx\n", addr);
- if (addr > HIGH_ADDR_MARK) {
- printf("Bad address %lx\n", addr);
- return 1;
- }
- return 0;
+ if (addr > HIGH_ADDR_MARK)
+ ksft_exit_fail_msg("Bad address %lx\n", addr);
}
static int validate_lower_address_hint(void)
@@ -107,23 +100,29 @@ int main(int argc, char *argv[])
char *hint;
unsigned long i, lchunks, hchunks;
+ ksft_print_header();
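+ /* One result covering both the low and high address range sweeps. */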
+ ksft_set_plan(1);
+
for (i = 0; i < NR_CHUNKS_LOW; i++) {
ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr[i] == MAP_FAILED) {
- if (validate_lower_address_hint())
- return 1;
+ if (validate_lower_address_hint()) {
+ ksft_test_result_skip("Memory constraint not fulfilled\n");
+ ksft_finished();
+ }
break;
}
- if (validate_addr(ptr[i], 0))
- return 1;
+ validate_addr(ptr[i], 0);
}
lchunks = i;
hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *));
- if (hptr == NULL)
- return 1;
+ if (hptr == NULL) {
+ ksft_test_result_skip("Memory constraint not fulfilled\n");
+ ksft_finished();
+ }
for (i = 0; i < NR_CHUNKS_HIGH; i++) {
hint = hind_addr();
@@ -133,8 +132,7 @@ int main(int argc, char *argv[])
if (hptr[i] == MAP_FAILED)
break;
- if (validate_addr(hptr[i], 1))
- return 1;
+ validate_addr(hptr[i], 1);
}
hchunks = i;
@@ -145,5 +143,7 @@ int main(int argc, char *argv[])
munmap(hptr[i], MAP_CHUNK_SIZE);
free(hptr);
- return 0;
+
+ ksft_test_result_pass("Test\n");
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 05736c615734..5a62530da3b5 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -232,17 +232,17 @@ int64_t allocate_transhuge(void *ptr, int pagemap_fd)
if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
MAP_FIXED | MAP_ANONYMOUS |
MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
- errx(2, "mmap transhuge");
+ ksft_exit_fail_msg("mmap transhuge\n");
if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
- err(2, "MADV_HUGEPAGE");
+ ksft_exit_fail_msg("MADV_HUGEPAGE\n");
/* allocate transparent huge page */
*(volatile void **)ptr = ptr;
if (pread(pagemap_fd, ent, sizeof(ent),
(uintptr_t)ptr >> (pshift() - 3)) != sizeof(ent))
- err(2, "read pagemap");
+ ksft_exit_fail_msg("read pagemap\n");
if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&