diff options
Diffstat (limited to 'include/linux')
-rw-r--r-- | include/linux/buffer_head.h | 48 | ||||
-rw-r--r-- | include/linux/gfp.h | 3 | ||||
-rw-r--r-- | include/linux/huge_mm.h | 23 | ||||
-rw-r--r-- | include/linux/hugetlb.h | 4 | ||||
-rw-r--r-- | include/linux/hugetlb_cgroup.h | 19 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 7 | ||||
-rw-r--r-- | include/linux/memory_hotplug.h | 22 | ||||
-rw-r--r-- | include/linux/mempolicy.h | 13 | ||||
-rw-r--r-- | include/linux/mm.h | 35 | ||||
-rw-r--r-- | include/linux/mmzone.h | 28 | ||||
-rw-r--r-- | include/linux/page_counter.h | 35 | ||||
-rw-r--r-- | include/linux/page_ext.h | 24 | ||||
-rw-r--r-- | include/linux/page_idle.h | 34 | ||||
-rw-r--r-- | include/linux/pagemap.h | 4 | ||||
-rw-r--r-- | include/linux/pagewalk.h | 10 | ||||
-rw-r--r-- | include/linux/pgtable.h | 9 | ||||
-rw-r--r-- | include/linux/rmap.h | 66 | ||||
-rw-r--r-- | include/linux/sched/sysctl.h | 1 | ||||
-rw-r--r-- | include/linux/slab.h | 6 |
19 files changed, 290 insertions, 101 deletions
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 089c9ade4325..9b6556d3f110 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -214,8 +214,6 @@ struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, - gfp_t gfp); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); @@ -225,7 +223,6 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); void __lock_buffer(struct buffer_head *bh); -void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); @@ -233,7 +230,9 @@ int submit_bh(blk_opf_t, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); -int bh_submit_read(struct buffer_head *bh); +int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); +void __bh_read_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags, bool force_lock); extern int buffer_heads_over_limit; @@ -340,12 +339,6 @@ sb_breadahead(struct super_block *sb, sector_t block) __breadahead(sb->s_bdev, block, sb->s_blocksize); } -static inline void -sb_breadahead_unmovable(struct super_block *sb, sector_t block) -{ - __breadahead_gfp(sb->s_bdev, block, sb->s_blocksize, 0); -} - static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { @@ -407,6 +400,41 @@ static inline struct buffer_head *__getblk(struct block_device *bdev, return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); } +static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (!buffer_uptodate(bh) && trylock_buffer(bh)) { + if (!buffer_uptodate(bh)) + __bh_read(bh, op_flags, false); + else + unlock_buffer(bh); + } +} + +static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (!bh_uptodate_or_lock(bh)) + __bh_read(bh, op_flags, false); +} + +/* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */ +static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (bh_uptodate_or_lock(bh)) + return 1; + return __bh_read(bh, op_flags, true); +} + +static inline void bh_read_batch(int nr, struct buffer_head *bhs[]) +{ + __bh_read_batch(nr, bhs, 0, true); +} + +static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags) +{ + __bh_read_batch(nr, bhs, op_flags, false); +} + /** * __bread() - reads a specified block and returns the bh * @bdev: the block_device to read from diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f314be58fa77..ea6cb9399152 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -18,6 +18,9 @@ static inline int gfp_migratetype(const gfp_t gfp_flags) VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); + BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); + BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> + GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 768e5261fdae..38265f9f782e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf); +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -219,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); +int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); @@ -321,8 +323,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, } static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs) { return false; } @@ -362,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma, static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - BUG(); - return 0; + return -EINVAL; } + +static inline int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + return -EINVAL; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ec981a0d8b3..57e72954a482 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1123,14 +1123,10 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h, #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) extern void __init hugetlb_cma_reserve(int order); -extern void __init hugetlb_cma_check(void); #else static inline __init void hugetlb_cma_reserve(int order) { } -static inline __init void hugetlb_cma_check(void) -{ -} #endif bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 379344828e78..630cd255d0cf 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -90,32 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return __hugetlb_cgroup_from_page(page, true); } -static inline int __set_hugetlb_cgroup(struct page *page, +static inline void __set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_PAGE(!PageHuge(page), page); if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) - return -1; + return; if (rsvd) set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD, (unsigned long)h_cg); else set_page_private(page + SUBPAGE_INDEX_CGROUP, (unsigned long)h_cg); - return 0; } -static inline int set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - return __set_hugetlb_cgroup(page, h_cg, false); + __set_hugetlb_cgroup(page, h_cg, false); } -static inline int set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - return __set_hugetlb_cgroup(page, h_cg, true); + __set_hugetlb_cgroup(page, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) @@ -199,16 +198,14 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return NULL; } -static inline int set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - return 0; } -static inline int set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - return 0; } static inline bool hugetlb_cgroup_disabled(void) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6257867fbf95..a2461f9a8738 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -354,10 +354,11 @@ struct mem_cgroup { }; /* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. + * size of first charge trial. + * TODO: maybe necessary to use big numbers in big irons or dynamic based of the + * workload. */ -#define MEMCG_CHARGE_BATCH 32U +#define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..51052969dbfe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -11,7 +11,6 @@ struct page; struct zone; struct pglist_data; struct mem_section; -struct memory_block; struct memory_group; struct resource; struct vmem_altmap; @@ -216,6 +215,22 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); +/* See kswapd_is_running() */ +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) +{ + mutex_lock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) +{ + mutex_unlock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) +{ + mutex_init(&pgdat->kswapd_lock); +} + #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ @@ -252,6 +267,10 @@ static inline bool movable_node_is_enabled(void) { return false; } + +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} #endif /* ! CONFIG_MEMORY_HOTPLUG */ /* @@ -333,7 +352,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); -extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 668389b4b53d..d232de7cdc56 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -151,13 +151,6 @@ extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - struct mempolicy *mpol = get_task_policy(current); - - return policy_nodemask(gfp, mpol); -} - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -189,6 +182,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) return (pol->mode == MPOL_PREFERRED_MANY); } +extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else @@ -294,11 +288,6 @@ static inline void mpol_put_task_policy(struct task_struct *task) { } -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - return NULL; -} - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; diff --git a/include/linux/mm.h b/include/linux/mm.h index 21f8b27bd9fd..8a5ad9d050bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1255,6 +1255,18 @@ static inline int folio_nid(const struct folio *folio) } #ifdef CONFIG_NUMA_BALANCING +/* page access time bits needs to hold at least 4 seconds */ +#define PAGE_ACCESS_TIME_MIN_BITS 12 +#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS +#define PAGE_ACCESS_TIME_BUCKETS \ + (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) +#else +#define PAGE_ACCESS_TIME_BUCKETS 0 +#endif + +#define PAGE_ACCESS_TIME_MASK \ + (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) + static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); @@ -1318,12 +1330,25 @@ static inline void page_cpupid_reset_last(struct page *page) page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ + +static inline int xchg_page_access_time(struct page *page, int time) +{ + int last_time; + + last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); + return last_time << PAGE_ACCESS_TIME_BUCKETS; +} #else /* !CONFIG_NUMA_BALANCING */ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { return page_to_nid(page); /* XXX */ } +static inline int xchg_page_access_time(struct page *page, int time) +{ + return 0; +} + static inline int page_cpupid_last(struct page *page) { return page_to_nid(page); /* XXX */ @@ -2495,7 +2520,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); -extern unsigned long find_min_pfn_with_active_regions(void); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) @@ -2975,8 +2999,8 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq - * * GUP-fast and KSM or temporary unmapping (swap, migration): - * clear/invalidate+flush of the page table entry + * * GUP-fast and KSM or temporary unmapping (swap, migration): see + * page_try_share_anon_rmap() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a @@ -2997,6 +3021,11 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) */ if (!PageAnon(page)) return false; + + /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_rmb(); + /* * Note that PageKsm() pages cannot be exclusive, and consequently, * cannot get pinned. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e24b40c52468..18cf0fc5ce67 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -24,10 +24,10 @@ #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ -#ifndef CONFIG_FORCE_MAX_ZONEORDER +#ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_ORDER 11 #else -#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER +#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) @@ -221,6 +221,7 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ + PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif NR_VM_NODE_STAT_ITEMS }; @@ -306,6 +307,8 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } +#define WORKINGSET_ANON 0 +#define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { @@ -953,8 +956,10 @@ typedef struct pglist_data { atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ - struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/done() */ +#ifdef CONFIG_MEMORY_HOTPLUG + struct mutex kswapd_lock; +#endif + struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; @@ -996,6 +1001,21 @@ typedef struct pglist_data { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_NUMA_BALANCING + /* start time in ms of current promote rate limit period */ + unsigned int nbp_rl_start; + /* number of promote candidate pages at start time of current rate limit period */ + unsigned long nbp_rl_nr_cand; + /* promote threshold in ms */ + unsigned int nbp_threshold; + /* start time in ms of current promote threshold adjustment period */ + unsigned int nbp_th_start; + /* + * number of promote candidate pages at stat time of current promote + * threshold adjustment period + */ + unsigned long nbp_th_nr_cand; +#endif /* Fields commonly accessed by the page reclaim scanner */ /* diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 679591301994..78a1c934e416 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -3,15 +3,26 @@ #define _LINUX_PAGE_COUNTER_H #include <linux/atomic.h> +#include <linux/cache.h> #include <linux/kernel.h> #include <asm/page.h> +#if defined(CONFIG_SMP) +struct pc_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define PC_PADDING(name) struct pc_padding name +#else +#define PC_PADDING(name) +#endif + struct page_counter { + /* + * Make sure 'usage' does not share cacheline with any other field. The + * memcg->memory.usage is a hot member of struct mem_cgroup. + */ atomic_long_t usage; - unsigned long min; - unsigned long low; - unsigned long high; - unsigned long max; + PC_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -23,18 +34,18 @@ struct page_counter { atomic_long_t low_usage; atomic_long_t children_low_usage; - /* legacy */ unsigned long watermark; unsigned long failcnt; - /* - * 'parent' is placed here to be far from 'usage' to reduce - * cache false sharing, as 'usage' is written mostly while - * parent is frequently read for cgroup's hierarchical - * counting nature. - */ + /* Keep all the read most fields in a separete cacheline. */ + PC_PADDING(_pad2_); + + unsigned long min; + unsigned long low; + unsigned long high; + unsigned long max; struct page_counter *parent; -}; +} ____cacheline_internodealigned_in_smp; #if BITS_PER_LONG == 32 #define PAGE_COUNTER_MAX LONG_MAX diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index fabb2e1e087f..22be4582faae 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -36,9 +36,15 @@ struct page_ext { unsigned long flags; }; +extern bool early_page_ext; extern unsigned long page_ext_size; extern void pgdat_page_ext_init(struct pglist_data *pgdat); +static inline bool early_page_ext_enabled(void) +{ + return early_page_ext; +} + #ifdef CONFIG_SPARSEMEM static inline void page_ext_init_flatmem(void) { @@ -55,7 +61,8 @@ static inline void page_ext_init(void) } #endif -struct page_ext *lookup_page_ext(const struct page *page); +extern struct page_ext *page_ext_get(struct page *page); +extern void page_ext_put(struct page_ext *page_ext); static inline struct page_ext *page_ext_next(struct page_ext *curr) { @@ -67,13 +74,13 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; -static inline void pgdat_page_ext_init(struct pglist_data *pgdat) +static inline bool early_page_ext_enabled(void) { + return false; } -static inline struct page_ext *lookup_page_ext(const struct page *page) +static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { - return NULL; } static inline void page_ext_init(void) @@ -87,5 +94,14 @@ static inline void page_ext_init_flatmem_late(void) static inline void page_ext_init_flatmem(void) { } + +static inline struct page_ext *page_ext_get(struct page *page) +{ + return NULL; +} + +static inline void page_ext_put(struct page_ext *page_ext) +{ +} #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */ diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 4663dfed1293..5cb7bd2078ec 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,65 +13,79 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ - static inline bool folio_test_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline void folio_set_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); } static inline bool folio_test_clear_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline bool folio_test_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_idle; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); + + return page_idle; } static inline void folio_set_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } static inline void folio_clear_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } #endif /* !CONFIG_64BIT */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0178b2040ea3..09de43e36a64 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -718,8 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages); +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index ac7b38ad5903..f3fafb731ffd 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -15,12 +15,12 @@ struct mm_walk; * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (lowest-level) - * entry + * @pte_entry: if set, called for each PTE (lowest-level) entry, + * including empty ones * @pte_hole: if set, called for each hole at all levels, - * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD - * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal - * to 1) are skipped. + * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. + * Any folded depths (where PTRS_PER_P?D is equal to 1) + * are skipped. * @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 014ee8f0fbaa..d13b4f7cc5be 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1276,8 +1276,7 @@ static inline int pgd_devmap(pgd_t pgd) #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ - (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ - !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; @@ -1598,11 +1597,7 @@ typedef unsigned int pgtbl_mod_mask; #endif #ifndef has_transparent_hugepage -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define has_transparent_hugepage() 1 -#else -#define has_transparent_hugepage() 0 -#endif +#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf80adca980b..72b2bcc37f73 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -267,7 +267,7 @@ dup: * @page: the exclusive anonymous page to try marking possibly shared * * The caller needs to hold the PT lock and has to have the page table entry - * cleared/invalidated+flushed, to properly sync against GUP-fast. + * cleared/invalidated. * * This is similar to page_try_dup_anon_rmap(), however, not used during fork() * to duplicate a mapping, but instead to prepare for KSM or temporarily @@ -283,12 +283,68 @@ static inline int page_try_share_anon_rmap(struct page *page) { VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); - /* See page_try_dup_anon_rmap(). */ - if (likely(!is_device_private_page(page) && - unlikely(page_maybe_dma_pinned(page)))) - return -EBUSY; + /* device private pages cannot get pinned via GUP. */ + if (unlikely(is_device_private_page(page))) { + ClearPageAnonExclusive(page); + return 0; + } + /* + * We have to make sure that when we clear PageAnonExclusive, that + * the page is not pinned and that concurrent GUP-fast won't succeed in + * concurrently pinning the page. + * + * Conceptually, PageAnonExclusive clearing consists of: + * (A1) Clear PTE + * (A2) Check if the page is pinned; back off if so. + * (A3) Clear PageAnonExclusive + * (A4) Restore PTE (optional, but certainly not writable) + * + * When clearing PageAnonExclusive, we cannot possibly map the page + * writable again, because anon pages that may be shared must never + * be writable. So in any case, if the PTE was writable it cannot + * be writable anymore afterwards and there would be a PTE change. Only + * if the PTE wasn't writable, there might not be a PTE change. + * + * Conceptually, GUP-fast pinning of an anon page consists of: + * (B1) Read the PTE + * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so. + * (B3) Pin the mapped page + * (B4) Check if the PTE changed by re-reading it; back off if so. + * (B5) If the original PTE is not writable, check if + * PageAnonExclusive is not set; back off if so. + * + * If the PTE was writable, we only have to make sure that GUP-fast + * observes a PTE change and properly backs off. + * + * If the PTE was not writable, we have to make sure that GUP-fast either + * detects a (temporary) PTE change or that PageAnonExclusive is cleared + * and properly backs off. + * + * Consequently, when clearing PageAnonExclusive(), we have to make + * sure that (A1), (A2)/(A3) and (A4) happen in the right memory + * order. In GUP-fast pinning code, we have to make sure that (B3),(B4) + * and (B5) happen in the right memory order. + * + * We assume that there might not be a memory barrier after + * clearing/invalidating the PTE (A1) and before restoring the PTE (A4), + * so we use explicit ones here. + */ + + /* Paired with the memory barrier in try_grab_folio(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb(); + + if (unlikely(page_maybe_dma_pinned(page))) + return -EBUSY; ClearPageAnonExclusive(page); + + /* + * This is conceptually a smp_wmb() paired with the smp_rmb() in + * gup_must_unshare(). + */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb__after_atomic(); return 0; } diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index e650946816d0..303ee7dd0c7e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -27,6 +27,7 @@ enum sched_tunable_scaling { #ifdef CONFIG_NUMA_BALANCING extern int sysctl_numa_balancing_mode; +extern unsigned int sysctl_numa_balancing_promote_rate_limit; #else #define sysctl_numa_balancing_mode 0 #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..352e3f082acc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -119,6 +119,12 @@ */ #define SLAB_NO_USER_FLAGS ((slab_flags_t __force)0x10000000U) +#ifdef CONFIG_KFENCE +#define SLAB_SKIP_KFENCE ((slab_flags_t __force)0x20000000U) +#else +#define SLAB_SKIP_KFENCE 0 +#endif + /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) |