path: root/include/linux/mm.h
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-04-28 05:42:02 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-04-28 05:42:02 +0300
commit	7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
tree	cc8fd6b4f936ec01e73238643757451e20478c07 /include/linux/mm.h
parent	91ec4b0d11fe115581ce2835300558802ce55e6c (diff)
parent	4d4b6d66db63ceed399f1fb1a4b24081d2590eb1 (diff)
download	linux-7fa8a8ee9400fe8ec188426e40e481717bc5e924.tar.xz
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - Nick Piggin's "shoot lazy tlbs" series, to improve the performance of
   switching from a user process to a kernel thread.
 - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav.
 - zsmalloc performance improvements from Sergey Senozhatsky.
 - Yue Zhao has found and fixed some data race issues around the
   alteration of memcg userspace tunables.
 - VFS rationalizations from Christoph Hellwig:
     - removal of most of the callers of write_one_page()
     - make __filemap_get_folio()'s return value more useful
 - Luis Chamberlain has changed tmpfs so it no longer requires swap
   backing. Use `mount -o noswap'.
 - Qi Zheng has made the slab shrinkers operate locklessly, providing
   some scalability benefits.
 - Keith Busch has improved dmapool's performance, making part of its
   operations O(1) rather than O(n).
 - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultfd,
   permitting userspace to write-protect unpopulated ptes in anonymous
   memory.
 - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather
   than exclusive, and has fixed a bunch of errors which were caused by
   its unintuitive meaning.
 - Axel Rasmussen gives userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
   which causes minor faults to install a write-protected pte.
 - Vlastimil Babka has done some maintenance work on vma_merge():
   cleanups to the kernel code and improvements to our userspace test
   harness.
 - Cleanups to do_fault_around() by Lorenzo Stoakes.
 - Mike Rapoport has moved a lot of initialization code out of various
   mm/ files and into mm/mm_init.c.
 - Lorenzo Stoakes removed vmf_insert_mixed_prot(), which was added for
   DRM, but DRM doesn't use it any more.
 - Lorenzo has also converted read_kcore() and vread() to use iterators
   and has thereby removed the use of bounce buffers in some cases.
 - Lorenzo has also contributed further cleanups of vma_merge().
 - Chaitanya Prakash provides some fixes to the mmap selftesting code.
 - Matthew Wilcox changes xfs and afs so they no longer take sleeping
   locks in ->map_page(), a step towards RCUification of pagefaults.
 - Suren Baghdasaryan has improved mmap_lock scalability by switching to
   per-VMA locking.
 - Frederic Weisbecker has reworked the percpu cache draining so that it
   no longer causes latency glitches on cpu isolated workloads.
 - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
   logic.
 - Liu Shixin has changed zswap's initialization so we no longer waste a
   chunk of memory if zswap is not being used.
 - Yosry Ahmed has improved the performance of memcg statistics flushing.
 - David Stevens has fixed several issues involving khugepaged,
   userfaultfd and shmem.
 - Christoph Hellwig has provided some cleanup work to zram's IO-related
   code paths.
 - David Hildenbrand has fixed up some issues in the selftest code's
   testing of our pte state changing.
 - Pankaj Raghav has made page_endio() unneeded and has removed it.
 - Peter Xu contributed some rationalizations of the userfaultfd
   selftests.
 - Yosry Ahmed has fixed an issue around memcg's page reclaim accounting.
 - Chaitanya Prakash has fixed some arm-related issues in the
   selftests/mm code.
 - Longlong Xia has improved the way in which KSM handles hwpoisoned
   pages.
 - Peter Xu fixes a few issues with uffd-wp at fork() time.
 - Stefan Roesch has changed KSM so that it may now be used on a
   per-process and per-cgroup basis.
* tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
  mm,unmap: avoid flushing TLB in batch if PTE is inaccessible
  shmem: restrict noswap option to initial user namespace
  mm/khugepaged: fix conflicting mods to collapse_file()
  sparse: remove unnecessary 0 values from rc
  mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()
  hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()
  maple_tree: fix allocation in mas_sparse_area()
  mm: do not increment pgfault stats when page fault handler retries
  zsmalloc: allow only one active pool compaction context
  selftests/mm: add new selftests for KSM
  mm: add new KSM process and sysfs knobs
  mm: add new api to enable ksm per process
  mm: shrinkers: fix debugfs file permissions
  mm: don't check VMA write permissions if the PTE/PMD indicates write permissions
  migrate_pages_batch: fix statistics for longterm pin retry
  userfaultfd: use helper function range_in_vma()
  lib/show_mem.c: use for_each_populated_zone() simplify code
  mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()
  fs/buffer: convert create_page_buffers to folio_create_buffers
  fs/buffer: add folio_create_empty_buffers helper
  ...
Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--  include/linux/mm.h | 206
1 file changed, 181 insertions, 25 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 98da268b834a..3731999cd9f0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -29,6 +29,7 @@
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/memremap.h>
+#include <linux/slab.h>
struct mempolicy;
struct anon_vma;
@@ -38,6 +39,7 @@ struct pt_regs;
extern int sysctl_page_lock_unfairness;
+void mm_core_init(void);
void init_mm_internals(void);
#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */
@@ -256,6 +258,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
@@ -478,7 +482,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{ FAULT_FLAG_USER, "USER" }, \
{ FAULT_FLAG_REMOTE, "REMOTE" }, \
{ FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }
+ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \
+ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" }
/*
* vm_fault is filled by the pagefault handler and passed to the vma's
@@ -623,6 +628,131 @@ struct vm_operations_struct {
unsigned long addr);
};
+#ifdef CONFIG_NUMA_BALANCING
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+ vma->numab_state = NULL;
+}
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+ kfree(vma->numab_state);
+}
+#else
+static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
+static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+ /* Check before locking. A race might cause false locked result. */
+ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ return false;
+
+ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+ return false;
+
+ /*
+ * Overflow might produce false locked result.
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ */
+ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+ up_read(&vma->vm_lock->lock);
+ return false;
+ }
+ return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+ rcu_read_lock(); /* keeps vma alive till the end of up_read */
+ up_read(&vma->vm_lock->lock);
+ rcu_read_unlock();
+}
+
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+
+ /*
+ * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+ * mm->mm_lock_seq can't be concurrently modified.
+ */
+ *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+ return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ if (__is_vma_write_locked(vma, &mm_lock_seq))
+ return;
+
+ down_write(&vma->vm_lock->lock);
+ vma->vm_lock_seq = mm_lock_seq;
+ up_write(&vma->vm_lock->lock);
+}
+
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ if (__is_vma_write_locked(vma, &mm_lock_seq))
+ return true;
+
+ if (!down_write_trylock(&vma->vm_lock->lock))
+ return false;
+
+ vma->vm_lock_seq = mm_lock_seq;
+ up_write(&vma->vm_lock->lock);
+ return true;
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+ /* When detaching vma should be write-locked */
+ if (detached)
+ vma_assert_write_locked(vma);
+ vma->detached = detached;
+}
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address);
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+ { return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+ { return true; }
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+ bool detached) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
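
(Usage note, not part of the diff.) The hunk above adds the per-VMA read/write lock primitives used by the new per-VMA locking fault path. Below is a minimal sketch of how an architecture page-fault handler is expected to consume this API; the function name do_user_page_fault() and the simplified fallback are illustrative, not taken from this merge:

	/*
	 * Illustrative sketch: try the speculative per-VMA read lock first and
	 * fall back to mmap_lock when the VMA cannot be locked or the fault
	 * must be retried.  Signal and access-permission checks are omitted.
	 */
	static vm_fault_t do_user_page_fault(struct mm_struct *mm, unsigned long address,
					     unsigned int flags, struct pt_regs *regs)
	{
		struct vm_area_struct *vma;
		vm_fault_t fault;

		vma = lock_vma_under_rcu(mm, address);	/* returns a read-locked VMA or NULL */
		if (!vma)
			goto lock_mmap;

		fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
		vma_end_read(vma);

		if (!(fault & VM_FAULT_RETRY))
			return fault;

	lock_mmap:
		/* Classic slow path under mmap_lock. */
		mmap_read_lock(mm);
		vma = find_vma(mm, address);
		if (!vma) {
			mmap_read_unlock(mm);
			return VM_FAULT_SIGSEGV;
		}
		fault = handle_mm_fault(vma, address, flags, regs);
		mmap_read_unlock(mm);
		return fault;
	}

As the comments in the hunk state, vma_start_read() may spuriously report the VMA as write-locked (in which case lock_vma_under_rcu() returns NULL and the handler falls back to mmap_lock), but it never spuriously reports the VMA as unlocked.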
@@ -631,6 +761,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
+ vma_mark_detached(vma, false);
+ vma_numab_state_init(vma);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
@@ -644,28 +776,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
vm_flags_init(vma, flags);
}
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}
static inline void vm_flags_set(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
ACCESS_PRIVATE(vma, __vm_flags) |= flags;
}
static inline void vm_flags_clear(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
}
@@ -686,7 +818,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma,
static inline void vm_flags_mod(struct vm_area_struct *vma,
vm_flags_t set, vm_flags_t clear)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
__vm_flags_mod(vma, set, clear);
}
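
(Usage note, not part of the diff.) The vm_flags helpers above now write-lock the VMA via vma_start_write() instead of only asserting that mmap_lock is held for writing; since vma_start_write() itself asserts the mmap write lock, callers keep taking it, and flag updates additionally exclude concurrent per-VMA readers. A hedged sketch of a caller; mark_vma_locked() is illustrative, not from this diff:

	/* Illustrative caller: set VM_LOCKED on the first VMA at or above @addr. */
	static int mark_vma_locked(struct mm_struct *mm, unsigned long addr)
	{
		struct vm_area_struct *vma;
		int ret = -ENOMEM;

		mmap_write_lock(mm);		/* still required by vma_start_write() */
		vma = find_vma(mm, addr);
		if (vma) {
			/* vm_flags_set() write-locks the VMA before touching the flags. */
			vm_flags_set(vma, VM_LOCKED);
			ret = 0;
		}
		mmap_write_unlock(mm);
		return ret;
	}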
@@ -1554,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
return last_time << PAGE_ACCESS_TIME_BUCKETS;
}
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+ unsigned int pid_bit;
+
+ pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+ if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
+ __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+ }
+}
#else /* !CONFIG_NUMA_BALANCING */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
@@ -1603,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
return false;
}
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
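
(Usage note, not part of the diff.) vma_set_access_pid_bit() above records recent accessors of a VMA for NUMA balancing by hashing the current pid into a BITS_PER_LONG-wide bitmap (the code uses access_pids[1] as the current window). A standalone sketch of the bit selection, reproducing the kernel's hash_32()/GOLDEN_RATIO_32 multiplicative hash from include/linux/hash.h in userspace for illustration:

	#include <stdint.h>
	#include <stdio.h>

	#define GOLDEN_RATIO_32	0x61C88647u	/* as in include/linux/hash.h */

	/* hash_32(val, bits): multiplicative hash, keep the top 'bits' bits. */
	static uint32_t hash_32(uint32_t val, unsigned int bits)
	{
		return (val * GOLDEN_RATIO_32) >> (32 - bits);
	}

	int main(void)
	{
		/* ilog2(BITS_PER_LONG) == 6 on 64-bit, so the result is a bit index 0..63. */
		for (uint32_t pid = 1000; pid < 1005; pid++)
			printf("pid %u -> access_pids bit %u\n", pid, hash_32(pid, 6));
		return 0;
	}

Collisions are acceptable here: the bitmap is only a cheap heuristic filter for which tasks recently touched the VMA, tested and set from the fault path.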
@@ -2636,12 +2782,6 @@ static inline bool ptlock_init(struct page *page) { return true; }
static inline void ptlock_free(struct page *page) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */
-static inline void pgtable_init(void)
-{
- ptlock_cache_init();
- pgtable_cache_init();
-}
-
static inline bool pgtable_pte_page_ctor(struct page *page)
{
if (!ptlock_init(page))
@@ -2785,7 +2925,6 @@ extern unsigned long free_reserved_area(void *start, void *end,
int poison, const char *s);
extern void adjust_managed_page_count(struct page *page, long count);
-extern void mem_init_print_info(void);
extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
@@ -2896,7 +3035,6 @@ extern void setup_per_cpu_pageset(void);
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;
-extern bool arch_has_descending_max_zone_pfns(void);
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
@@ -3185,8 +3323,6 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
@@ -3256,7 +3392,6 @@ extern int apply_to_existing_page_range(struct mm_struct *mm,
unsigned long address, unsigned long size,
pte_fn_t fn, void *data);
-extern void __init init_mem_debugging_and_hardening(void);
#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
@@ -3425,6 +3560,22 @@ void vmemmap_populate_print_last(void);
void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap);
#endif
+
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
+{
+ return is_power_of_2(sizeof(struct page)) &&
+ pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+}
+#else
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
+{
+ return false;
+}
+#endif
+
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
unsigned long nr_pages);
@@ -3451,6 +3602,7 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
#else
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
@@ -3471,6 +3623,12 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
}
#endif
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma, struct list_head *to_kill,
+ unsigned long ksm_addr);
+#endif
+
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@ -3540,14 +3698,12 @@ extern const struct attribute_group memory_failure_attr_group;
extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
unsigned int pages_per_huge_page);
-extern void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr_hint,
- struct vm_area_struct *vma,
- unsigned int pages_per_huge_page);
-extern long copy_huge_page_from_user(struct page *dst_page,
- const void __user *usr_src,
- unsigned int pages_per_huge_page,
- bool allow_pagefault);
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+ unsigned long addr_hint,
+ struct vm_area_struct *vma);
+long copy_folio_from_user(struct folio *dst_folio,
+ const void __user *usr_src,
+ bool allow_pagefault);
/**
* vma_is_special_huge - Are transhuge page-table entries considered special?