diff options
Diffstat (limited to 'mm')
65 files changed, 2981 insertions, 1761 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e..53d7485fc38f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -900,6 +900,20 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identifying individual anonymous memory areas. + Assigning a name to anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their name. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1e73717802f8..5bd5bb097252 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -62,6 +62,30 @@ config PAGE_OWNER If unsure, say N. +config PAGE_TABLE_CHECK + bool "Check for invalid mappings in user page tables" + depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK + select PAGE_EXTENSION + help + Check that anonymous page is not being mapped twice with read write + permissions. Check that anonymous and file pages are not being + erroneously shared. Since the checking is performed at the time + entries are added and removed to user page tables, leaking, corruption + and double mapping problems are detected synchronously. + + If unsure say "n". + +config PAGE_TABLE_CHECK_ENFORCED + bool "Enforce the page table checking by default" + depends on PAGE_TABLE_CHECK + help + Always enable page table checking. By default the page table checking + is disabled, and can be optionally enabled via page_table_check=on + kernel parameter. This config enforces that page table check is always + enabled. + + If unsure say "n". + config PAGE_POISONING bool "Poison pages after freeing" help diff --git a/mm/Makefile b/mm/Makefile index d6c0042e3aa0..588d3113f3b0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -15,6 +15,8 @@ KCSAN_SANITIZE_slab_common.o := n KCSAN_SANITIZE_slab.o := n KCSAN_SANITIZE_slub.o := n KCSAN_SANITIZE_page_alloc.o := n +# But enable explicit instrumentation for memory barriers. +KCSAN_INSTRUMENT_BARRIERS := y # These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of @@ -112,6 +114,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o diff --git a/mm/compaction.c b/mm/compaction.c index 6e446094ce90..b4e94cda3019 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2280,6 +2280,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + unsigned int nr_succeeded = 0; /* * These counters track activities during zone compaction. Initialize @@ -2398,10 +2399,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION, NULL); + MR_COMPACTION, &nr_succeeded); - trace_mm_compaction_migratepages(cc->nr_migratepages, err, - &cc->migratepages); + trace_mm_compaction_migratepages(cc->nr_migratepages, + nr_succeeded); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; diff --git a/mm/damon/core.c b/mm/damon/core.c index e92497895202..1dd153c31c9e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,7 +11,6 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/mm.h> -#include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> @@ -23,9 +22,6 @@ #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; @@ -53,17 +49,6 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) return region; } -/* - * Add a region between two other regions - */ -inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; -} - void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); @@ -106,8 +91,7 @@ struct damos *damon_new_scheme( scheme->min_age_region = min_age_region; scheme->max_age_region = max_age_region; scheme->action = action; - scheme->stat_count = 0; - scheme->stat_sz = 0; + scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); scheme->quota.ms = quota->ms; @@ -530,15 +514,17 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; + unsigned int ti = 0; /* target's index */ damon_for_each_target(t, c) { struct damon_region *r; damon_for_each_region(r, t) { - trace_damon_aggregated(t, r, damon_nr_regions(t)); + trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } + ti++; } } @@ -578,6 +564,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, struct damos_quota *quota = &s->quota; unsigned long sz = r->ar.end - r->ar.start; struct timespec64 begin, end; + unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -631,7 +618,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_split_region_at(c, t, r, sz); } ktime_get_coarse_ts64(&begin); - c->primitive.apply_scheme(c, t, r, s); + sz_applied = c->primitive.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -645,8 +632,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c, r->age = 0; update_stat: - s->stat_count++; - s->stat_sz += sz; + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; } } @@ -694,6 +684,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies( quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; @@ -733,7 +725,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -#define sz_damon_region(r) (r->ar.end - r->ar.start) +static inline unsigned long sz_damon_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} /* * Merge two adjacent regions into one region @@ -750,8 +745,6 @@ static void damon_merge_two_regions(struct damon_target *t, damon_destroy_region(r, t); } -#define diff_of(a, b) (a > b ? a - b : b - a) - /* * Merge adjacent regions having similar access frequencies * @@ -765,13 +758,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { - if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; else r->age++; if (prev && prev->ar.end == r->ar.start && - diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + abs(prev->nr_accesses - r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 1efac0022e9a..5b899601e56c 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, @@ -117,7 +117,9 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat_count, s->stat_sz); + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); if (!rc) return -ENOMEM; @@ -213,6 +215,13 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if (!damos_action_valid(action)) goto fail; + if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, min_age, max_age, action, "a, &wmarks); @@ -353,8 +362,9 @@ static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; + struct damon_target *t, *next_t; bool id_is_pid = true; - char *kbuf, *nrs; + char *kbuf; unsigned long *targets; ssize_t nr_targets; ssize_t ret; @@ -364,14 +374,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - nrs = kbuf; if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; /* target id is meaningless here, but we set it just for fun */ scnprintf(kbuf, count, "42 "); } - targets = str_to_target_ids(nrs, count, &nr_targets); + targets = str_to_target_ids(kbuf, count, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; @@ -397,8 +406,12 @@ static ssize_t dbgfs_target_ids_write(struct file *file, goto unlock_out; } - /* remove targets with previously-set primitive */ - damon_set_targets(ctx, NULL, 0); + /* remove previously set targets */ + damon_for_each_target_safe(t, next_t, ctx) { + if (targetid_is_pid(ctx)) + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } /* Configure the context for the address space type */ if (id_is_pid) @@ -650,10 +663,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) if (!targetid_is_pid(ctx)) return; + mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { put_pid((struct pid *)t->id); damon_destroy_target(t); } + mutex_unlock(&ctx->kdamond_lock); } static struct damon_ctx *dbgfs_new_ctx(void) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a496d6f203d6..5e8244f65a1a 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -73,7 +73,7 @@ static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, damon_pa_mkold(r->sampling_addr); } -void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -192,7 +192,7 @@ static void __damon_pa_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -213,14 +213,15 @@ bool damon_pa_target_valid(void *t) return true; } -int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { - unsigned long addr; + unsigned long addr, applied; LIST_HEAD(page_list); if (scheme->action != DAMOS_PAGEOUT) - return -EINVAL; + return 0; for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -241,13 +242,14 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, put_page(page); } } - reclaim_pages(&page_list); + applied = reclaim_pages(&page_list); cond_resched(); - return 0; + return applied * PAGE_SIZE; } -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h index 61f27037603e..e790cb5f8fe0 100644 --- a/mm/damon/prmtv-common.h +++ b/mm/damon/prmtv-common.h @@ -6,10 +6,6 @@ */ #include <linux/damon.h> -#include <linux/random.h> - -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) struct page *damon_get_page(unsigned long pfn); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index dc1485044eaf..bc476cef688e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -185,6 +185,36 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); +/* + * Number of memory regions that tried to be reclaimed. + */ +static unsigned long nr_reclaim_tried_regions __read_mostly; +module_param(nr_reclaim_tried_regions, ulong, 0400); + +/* + * Total bytes of memory regions that tried to be reclaimed. + */ +static unsigned long bytes_reclaim_tried_regions __read_mostly; +module_param(bytes_reclaim_tried_regions, ulong, 0400); + +/* + * Number of memory regions that successfully be reclaimed. + */ +static unsigned long nr_reclaimed_regions __read_mostly; +module_param(nr_reclaimed_regions, ulong, 0400); + +/* + * Total bytes of memory regions that successfully be reclaimed. + */ +static unsigned long bytes_reclaimed_regions __read_mostly; +module_param(bytes_reclaimed_regions, ulong, 0400); + +/* + * Number of times that the time/space quota limits have exceeded + */ +static unsigned long nr_quota_exceeds __read_mostly; +module_param(nr_quota_exceeds, ulong, 0400); + static struct damon_ctx *ctx; static struct damon_target *target; @@ -333,6 +363,21 @@ static void damon_reclaim_timer_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + nr_reclaim_tried_regions = s->stat.nr_tried; + bytes_reclaim_tried_regions = s->stat.sz_tried; + nr_reclaimed_regions = s->stat.nr_applied; + bytes_reclaimed_regions = s->stat.sz_applied; + nr_quota_exceeds = s->stat.qt_exceeds; + } + return 0; +} + static int __init damon_reclaim_init(void) { ctx = damon_new_ctx(); @@ -340,6 +385,7 @@ static int __init damon_reclaim_init(void) return -ENOMEM; damon_pa_set_primitives(ctx); + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; /* 4242 means nothing but fun */ target = damon_new_target(4242); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 20a9a9d69eb1..89b6468da2b9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -26,8 +26,10 @@ * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ -#define damon_get_task_struct(t) \ - (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task((struct pid *)t->id, PIDTYPE_PID); +} /* * Get the mm_struct of the given target @@ -98,16 +100,6 @@ static unsigned long sz_range(struct damon_addr_range *r) return r->end - r->start; } -static void swap_ranges(struct damon_addr_range *r1, - struct damon_addr_range *r2) -{ - struct damon_addr_range tmp; - - tmp = *r1; - *r1 = *r2; - *r2 = tmp; -} - /* * Find three regions separated by two biggest unmapped regions * @@ -146,9 +138,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma, gap.start = last_vma->vm_end; gap.end = vma->vm_start; if (sz_range(&gap) > sz_range(&second_gap)) { - swap_ranges(&gap, &second_gap); + swap(gap, second_gap); if (sz_range(&second_gap) > sz_range(&first_gap)) - swap_ranges(&second_gap, &first_gap); + swap(second_gap, first_gap); } next: last_vma = vma; @@ -159,7 +151,7 @@ next: /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) - swap_ranges(&first_gap, &second_gap); + swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); @@ -240,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t, static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { + struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; - int i; + int i, tidx = 0; if (damon_va_three_regions(t, regions)) { - pr_err("Failed to get three regions of target %lu\n", t->id); + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + tidx++; + } + pr_debug("Failed to get three regions of %dth target\n", tidx); return; } @@ -272,7 +270,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, } /* Initialize '->regions_list' of every target (task) */ -void damon_va_init(struct damon_ctx *ctx) +static void damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; @@ -292,7 +290,8 @@ void damon_va_init(struct damon_ctx *ctx) * * Returns true if it is. */ -static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) { return !(r->ar.end <= re->start || re->end <= r->ar.start); } @@ -356,7 +355,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, /* * Update regions for current memory mappings */ -void damon_va_update(struct damon_ctx *ctx) +static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; @@ -395,8 +394,65 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + if (!page) + return; + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + huge_ptep_set_access_flags(vma, addr, pte, entry, + vma->vm_flags & VM_WRITE); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) @@ -410,7 +466,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void damon_va_prepare_access_check(struct damon_ctx *ctx, +static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -418,7 +474,7 @@ static void damon_va_prepare_access_check(struct damon_ctx *ctx, damon_va_mkold(mm, r->sampling_addr); } -void damon_va_prepare_access_checks(struct damon_ctx *ctx) +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -429,7 +485,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(ctx, mm, r); mmput(mm); } } @@ -491,8 +547,47 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct damon_young_walk_private *priv = walk->private; + struct hstate *h = hstate_vma(walk->vma); + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + page = pte_page(entry); + if (!page) + goto out; + + get_page(page); + + if (pte_young(entry) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = huge_page_size(h); + priv->young = true; + } + + put_page(page); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_young_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, @@ -515,7 +610,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void damon_va_check_access(struct damon_ctx *ctx, +static void __damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { static struct mm_struct *last_mm; @@ -539,7 +634,7 @@ static void damon_va_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -551,7 +646,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - damon_va_check_access(ctx, mm, r); + __damon_va_check_access(ctx, mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); @@ -579,32 +674,34 @@ bool damon_va_target_valid(void *target) } #ifndef CONFIG_ADVISE_SYSCALLS -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { - return -EINVAL; + return 0; } #else -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { struct mm_struct *mm; - int ret = -ENOMEM; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long applied; mm = damon_get_mm(target); if (!mm) - goto out; + return 0; - ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), - PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + applied = do_madvise(mm, start, len, behavior) ? 0 : len; mmput(mm); -out: - return ret; + + return applied; } #endif /* CONFIG_ADVISE_SYSCALLS */ -int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { int madv_action; @@ -627,14 +724,15 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_STAT: return 0; default: - return -EINVAL; + return 0; } return damos_madvise(t, r, madv_action); } -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { diff --git a/mm/debug.c b/mm/debug.c index a05a39ff8fe4..bc9ac87f0e08 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -112,56 +112,8 @@ static void __dump_page(struct page *page) type = "ksm "; else if (PageAnon(page)) type = "anon "; - else if (mapping) { - struct inode *host; - const struct address_space_operations *a_ops; - struct hlist_node *dentry_first; - struct dentry *dentry_ptr; - struct dentry dentry; - unsigned long ino; - - /* - * mapping can be invalid pointer and we don't want to crash - * accessing it, so probe everything depending on it carefully - */ - if (get_kernel_nofault(host, &mapping->host) || - get_kernel_nofault(a_ops, &mapping->a_ops)) { - pr_warn("failed to read mapping contents, not a valid kernel address?\n"); - goto out_mapping; - } - - if (!host) { - pr_warn("aops:%ps\n", a_ops); - goto out_mapping; - } - - if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || - get_kernel_nofault(ino, &host->i_ino)) { - pr_warn("aops:%ps with invalid host inode %px\n", - a_ops, host); - goto out_mapping; - } - - if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, ino); - goto out_mapping; - } - - dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { - pr_warn("aops:%ps ino:%lx with invalid dentry %px\n", - a_ops, ino, dentry_ptr); - } else { - /* - * if dentry is corrupted, the %pd handler may still - * crash, but it's unlikely that we reach here with a - * corrupted struct page - */ - pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", - a_ops, ino, &dentry); - } - } -out_mapping: + else if (mapping) + dump_mapping(mapping); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %pGp%s\n", type, &head->flags, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 228e3954b90c..a7ac97c76762 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -652,7 +652,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) set_pte_at(args->mm, args->vaddr, args->ptep, pte); flush_dcache_page(page); barrier(); - pte_clear(args->mm, args->vaddr, args->ptep); + ptep_clear(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); } @@ -888,8 +888,8 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) pr_debug("Validating swap migration\n"); /* - * make_migration_entry() expects given page to be - * locked, otherwise it stumbles upon a BUG_ON(). + * make_[readable|writable]_migration_entry() expects given page to + * be locked, otherwise it stumbles upon a BUG_ON(). */ __SetPageLocked(page); swp = make_writable_migration_entry(page_to_pfn(page)); diff --git a/mm/dmapool.c b/mm/dmapool.c index 64b537b3ccb0..a7eb5d0eb2da 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + retval = kmalloc(sizeof(*retval), GFP_KERNEL); if (!retval) return retval; diff --git a/mm/filemap.c b/mm/filemap.c index 39c4c46c6133..2fd9b2f24025 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,99 +121,97 @@ */ static void page_cache_delete(struct address_space *mapping, - struct page *page, void *shadow) + struct folio *folio, void *shadow) { - XA_STATE(xas, &mapping->i_pages, page->index); - unsigned int nr = 1; + XA_STATE(xas, &mapping->i_pages, folio->index); + long nr = 1; mapping_set_update(&xas, mapping); /* hugetlb pages are represented by a single entry in the xarray */ - if (!PageHuge(page)) { - xas_set_order(&xas, page->index, compound_order(page)); - nr = compound_nr(page); + if (!folio_test_hugetlb(folio)) { + xas_set_order(&xas, folio->index, folio_order(folio)); + nr = folio_nr_pages(folio); } - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(nr != 1 && shadow, page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); - page->mapping = NULL; + folio->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } -static void unaccount_page_cache_page(struct address_space *mapping, - struct page *page) +static void filemap_unaccount_folio(struct address_space *mapping, + struct folio *folio) { - int nr; + long nr; /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave * stale data around in the cleancache once our page is gone */ - if (PageUptodate(page) && PageMappedToDisk(page)) - cleancache_put_page(page); + if (folio_test_uptodate(folio) && folio_test_mappedtodisk(folio)) + cleancache_put_page(&folio->page); else - cleancache_invalidate_page(mapping, page); + cleancache_invalidate_page(mapping, &folio->page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(page_mapped(page), page); - if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { + VM_BUG_ON_FOLIO(folio_mapped(folio), folio); + if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { int mapcount; pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", - current->comm, page_to_pfn(page)); - dump_page(page, "still mapped when deleted"); + current->comm, folio_pfn(folio)); + dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - mapcount = page_mapcount(page); + mapcount = page_mapcount(&folio->page); if (mapping_exiting(mapping) && - page_count(page) >= mapcount + 2) { + folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's - * a good bet that actually the page is unmapped, + * a good bet that actually the folio is unmapped, * and we'd prefer not to leak it: if we're wrong, * some other bad page check should catch it later. */ - page_mapcount_reset(page); - page_ref_sub(page, mapcount); + page_mapcount_reset(&folio->page); + folio_ref_sub(folio, mapcount); } } - /* hugetlb pages do not participate in page cache accounting. */ - if (PageHuge(page)) + /* hugetlb folios do not participate in page cache accounting. */ + if (folio_test_hugetlb(folio)) return; - nr = thp_nr_pages(page); + nr = folio_nr_pages(folio); - __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); - if (PageSwapBacked(page)) { - __mod_lruvec_page_state(page, NR_SHMEM, -nr); - if (PageTransHuge(page)) - __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); - } else if (PageTransHuge(page)) { - __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + } else if (folio_test_pmd_mappable(folio)) { + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } /* - * At this point page must be either written or cleaned by - * truncate. Dirty page here signals a bug and loss of + * At this point folio must be either written or cleaned by + * truncate. Dirty folio here signals a bug and loss of * unwritten data. * - * This fixes dirty accounting after removing the page entirely - * but leaves PageDirty set: it has no effect for truncated - * page and anyway will be cleared before returning page into + * This fixes dirty accounting after removing the folio entirely + * but leaves the dirty flag set: it has no effect for truncated + * folio and anyway will be cleared before returning folio to * buddy allocator. */ - if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); + if (WARN_ON_ONCE(folio_test_dirty(folio))) + folio_account_cleaned(folio, mapping, + inode_to_wb(mapping->host)); } /* @@ -221,87 +219,83 @@ static void unaccount_page_cache_page(struct address_space *mapping, * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ -void __delete_from_page_cache(struct page *page, void *shadow) +void __filemap_remove_folio(struct folio *folio, void *shadow) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; - trace_mm_filemap_delete_from_page_cache(page); - - unaccount_page_cache_page(mapping, page); - page_cache_delete(mapping, page, shadow); + trace_mm_filemap_delete_from_page_cache(folio); + filemap_unaccount_folio(mapping, folio); + page_cache_delete(mapping, folio, shadow); } -static void page_cache_free_page(struct address_space *mapping, - struct page *page) +void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*freepage)(struct page *); freepage = mapping->a_ops->freepage; if (freepage) - freepage(page); + freepage(&folio->page); - if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, thp_nr_pages(page)); - VM_BUG_ON_PAGE(page_count(page) <= 0, page); + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) { + folio_ref_sub(folio, folio_nr_pages(folio)); + VM_BUG_ON_FOLIO(folio_ref_count(folio) <= 0, folio); } else { - put_page(page); + folio_put(folio); } } /** - * delete_from_page_cache - delete page from page cache - * @page: the page which the kernel is trying to remove from page cache + * filemap_remove_folio - Remove folio from page cache. + * @folio: The folio. * - * This must be called only on pages that have been verified to be in the page - * cache and locked. It will never put the page into the free list, the caller - * has a reference on the page. + * This must be called only on folios that are locked and have been + * verified to be in the page cache. It will never put the folio into + * the free list because the caller has a reference on the page. */ -void delete_from_page_cache(struct page *page) +void filemap_remove_folio(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio->mapping; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - __delete_from_page_cache(page, NULL); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - page_cache_free_page(mapping, page); + filemap_free_folio(mapping, folio); } -EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_delete_batch - delete several pages from page cache - * @mapping: the mapping to which pages belong - * @pvec: pagevec with pages to delete + * page_cache_delete_batch - delete several folios from page cache + * @mapping: the mapping to which folios belong + * @fbatch: batch of folios to delete * - * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index - * and is optimised for it to be dense. - * It tolerates holes in @pvec (mapping entries at those indices are not - * modified). The function expects only THP head pages to be present in the - * @pvec. + * The function walks over mapping->i_pages and removes folios passed in + * @fbatch from the mapping. The function expects @fbatch to be sorted + * by page index and is optimised for it to be dense. + * It tolerates holes in @fbatch (mapping entries at those indices are not + * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, - struct pagevec *pvec) + struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); - int total_pages = 0; + XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); + long total_pages = 0; int i = 0; - struct page *page; + struct folio *folio; mapping_set_update(&xas, mapping); - xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec)) + xas_for_each(&xas, folio, ULONG_MAX) { + if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our @@ -310,54 +304,48 @@ static void page_cache_delete_batch(struct address_space *mapping, * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, - page); + if (folio != fbatch->folios[i]) { + VM_BUG_ON_FOLIO(folio->index > + fbatch->folios[i]->index, folio); continue; } - WARN_ON_ONCE(!PageLocked(page)); + WARN_ON_ONCE(!folio_test_locked(folio)); - if (page->index == xas.xa_index) - page->mapping = NULL; - /* Leave page->index set: truncation lookup relies on it */ + folio->mapping = NULL; + /* Leave folio->index set: truncation lookup relies on it */ - /* - * Move to the next page in the vector if this is a regular - * page or the index is of the last sub-page of this compound - * page. - */ - if (page->index + compound_nr(page) - 1 == xas.xa_index) - i++; + i++; xas_store(&xas, NULL); - total_pages++; + total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec) + struct folio_batch *fbatch) { int i; - if (!pagevec_count(pvec)) + if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - for (i = 0; i < pagevec_count(pvec); i++) { - trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; - unaccount_page_cache_page(mapping, pvec->pages[i]); + trace_mm_filemap_delete_from_page_cache(folio); + filemap_unaccount_folio(mapping, folio); } - page_cache_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - for (i = 0; i < pagevec_count(pvec); i++) - page_cache_free_page(mapping, pvec->pages[i]); + for (i = 0; i < folio_batch_count(fbatch); i++) + filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) @@ -646,8 +634,8 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } -static bool filemap_range_has_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) +bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; @@ -667,34 +655,8 @@ static bool filemap_range_has_writeback(struct address_space *mapping, } rcu_read_unlock(); return page != NULL; - -} - -/** - * filemap_range_needs_writeback - check if range potentially needs writeback - * @mapping: address space within which to check - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Find at least one page in the range supplied, usually used to check if - * direct writing in this range will trigger a writeback. Used by O_DIRECT - * read/write with IOCB_NOWAIT, to see if the caller needs to do - * filemap_write_and_wait_range() before proceeding. - * - * Return: %true if the caller should do filemap_write_and_wait_range() before - * doing O_DIRECT to a page in this range, %false otherwise. - */ -bool filemap_range_needs_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) -{ - if (!mapping_needs_writeback(mapping)) - return false; - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - return false; - return filemap_range_has_writeback(mapping, start_byte, end_byte); } -EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); +EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range @@ -959,7 +921,7 @@ unlock: goto error; } - trace_mm_filemap_add_to_page_cache(&folio->page); + trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; @@ -1259,10 +1221,10 @@ enum behavior { * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like - * wait_on_page_writeback() waiting on PG_writeback. + * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, - * like put_and_wait_on_page_locked() on PG_locked. + * like folio_put_wait_locked() on PG_locked. */ }; @@ -1439,22 +1401,21 @@ int folio_wait_bit_killable(struct folio *folio, int bit_nr) EXPORT_SYMBOL(folio_wait_bit_killable); /** - * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked - * @page: The page to wait for. + * folio_put_wait_locked - Drop a reference and wait for it to be unlocked + * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * - * The caller should hold a reference on @page. They expect the page to + * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration - * (for example) by holding the reference while waiting for the page to + * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not - * dereference @page. + * dereference @folio. * - * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. + * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ -int put_and_wait_on_page_locked(struct page *page, int state) +int folio_put_wait_locked(struct folio *folio, int state) { - return folio_wait_bit_common(page_folio(page), PG_locked, state, - DROP); + return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** @@ -1979,37 +1940,36 @@ no_page: } EXPORT_SYMBOL(__filemap_get_folio); -static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, +static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { - struct page *page; + struct folio *folio; retry: if (mark == XA_PRESENT) - page = xas_find(xas, max); + folio = xas_find(xas, max); else - page = xas_find_marked(xas, max, mark); + folio = xas_find_marked(xas, max, mark); - if (xas_retry(xas, page)) + if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ - if (!page || xa_is_value(page)) - return page; + if (!folio || xa_is_value(folio)) + return folio; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto reset; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(xas))) { - put_page(page); + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); goto reset; } - return page; + return folio; reset: xas_reset(xas); goto retry; @@ -2020,56 +1980,36 @@ reset: * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). - * @pvec: Where the resulting entries are placed. + * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in - * the mapping. The entries are placed in @pvec. find_get_entries() - * takes a reference on any actual pages it returns. + * the mapping. The entries are placed in @fbatch. find_get_entries() + * takes a reference on any actual folios it returns. * - * The search returns a group of mapping-contiguous page cache entries - * with ascending indexes. There may be holes in the indices due to - * not-present pages. + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries or large folios. * - * Any shadow entries of evicted pages, or swap entries from + * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * - * If it finds a Transparent Huge Page, head or tail, find_get_entries() - * stops at that page: the caller is likely to have a better way to handle - * the compound page as a whole, and then skip its extent, than repeatedly - * calling find_get_entries() to return all its tails. - * - * Return: the number of pages and shadow entries which were found. + * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices) + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; - unsigned int ret = 0; - unsigned nr_entries = PAGEVEC_SIZE; + struct folio *folio; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { - /* - * Terminate early on finding a THP, to allow the caller to - * handle it all at once; but continue if this is hugetlbfs. - */ - if (!xa_is_value(page) && PageTransHuge(page) && - !PageHuge(page)) { - page = find_subpage(page, xas.xa_index); - nr_entries = ret + 1; - } - - indices[ret] = xas.xa_index; - pvec->pages[ret] = page; - if (++ret == nr_entries) + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) break; } rcu_read_unlock(); - pvec->nr = ret; - return ret; + return folio_batch_count(fbatch); } /** @@ -2077,63 +2017,64 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). - * @pvec: Where the resulting entries are placed. - * @indices: The cache indices of the entries in @pvec. + * @fbatch: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. - * Swap, shadow and DAX entries are included. Pages are returned - * locked and with an incremented refcount. Pages which are locked by - * somebody else or under writeback are skipped. Only the head page of - * a THP is returned. Pages which are partially outside the range are - * not returned. + * Swap, shadow and DAX entries are included. Folios are returned + * locked and with an incremented refcount. Folios which are locked + * by somebody else or under writeback are skipped. Folios which are + * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive - * due to not-present entries, THP pages, pages which could not be locked - * or pages under writeback. + * due to not-present entries, large folios, folios which could not be + * locked or folios under writeback. * * Return: The number of entries which were found. */ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices) + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { - if (!xa_is_value(page)) { - if (page->index < start) + while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + if (!xa_is_value(folio)) { + if (folio->index < start) goto put; - if (page->index + thp_nr_pages(page) - 1 > end) + if (folio->index + folio_nr_pages(folio) - 1 > end) goto put; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto put; - if (page->mapping != mapping || PageWriteback(page)) + if (folio->mapping != mapping || + folio_test_writeback(folio)) goto unlock; - VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), - page); + VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), + folio); } - indices[pvec->nr] = xas.xa_index; - if (!pagevec_add(pvec, page)) + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) break; - goto next; + continue; unlock: - unlock_page(page); + folio_unlock(folio); put: - put_page(page); -next: - if (!xa_is_value(page) && PageTransHuge(page)) { - unsigned int nr_pages = thp_nr_pages(page); - - /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ - xas_set(&xas, page->index + nr_pages); - if (xas.xa_index < nr_pages) - break; - } + folio_put(folio); } rcu_read_unlock(); - return pagevec_count(pvec); + return folio_batch_count(fbatch); +} + +static inline +bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) +{ + if (!folio_test_large(folio) || folio_test_hugetlb(folio)) + return false; + if (index >= max) + return false; + return index < folio->index + folio_nr_pages(folio) - 1; } /** @@ -2162,23 +2103,29 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, struct page **pages) { XA_STATE(xas, &mapping->i_pages, *start); - struct page *page; + struct folio *folio; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { /* Skip over shadow, swap and DAX entries */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - pages[ret] = find_subpage(page, xas.xa_index); +again: + pages[ret] = folio_file_page(folio, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } + if (folio_more_pages(folio, xas.xa_index, end)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } } /* @@ -2213,36 +2160,41 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { XA_STATE(xas, &mapping->i_pages, index); - struct page *page; + struct folio *folio; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - for (page = xas_load(&xas); page; page = xas_next(&xas)) { - if (xas_retry(&xas, page)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) break; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto retry; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(folio != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); +again: + pages[ret] = folio_file_page(folio, xas.xa_index); if (++ret == nr_pages) break; + if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } continue; put_page: - put_page(page); + folio_put(folio); retry: xas_reset(&xas); } @@ -2271,25 +2223,25 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, struct page **pages) { XA_STATE(xas, &mapping->i_pages, *index); - struct page *page; + struct folio *folio; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, tag))) { + while ((folio = find_get_entry(&xas, end, tag))) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - pages[ret] = page; + pages[ret] = &folio->page; if (++ret == nr_pages) { - *index = page->index + thp_nr_pages(page); + *index = folio->index + folio_nr_pages(folio); goto out; } } @@ -2332,52 +2284,50 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) } /* - * filemap_get_read_batch - Get a batch of pages for read + * filemap_get_read_batch - Get a batch of folios for read * - * Get a batch of pages which represent a contiguous range of bytes - * in the file. No tail pages will be returned. If @index is in the - * middle of a THP, the entire THP will be returned. The last page in - * the batch may have Readahead set or be not Uptodate so that the - * caller can take the appropriate action. + * Get a batch of folios which represent a contiguous range of bytes in + * the file. No exceptional entries will be returned. If @index is in + * the middle of a folio, the entire folio will be returned. The last + * folio in the batch may have the readahead flag set or the uptodate flag + * clear so that the caller can take the appropriate action. */ static void filemap_get_read_batch(struct address_space *mapping, - pgoff_t index, pgoff_t max, struct pagevec *pvec) + pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); - struct page *head; + struct folio *folio; rcu_read_lock(); - for (head = xas_load(&xas); head; head = xas_next(&xas)) { - if (xas_retry(&xas, head)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; - if (xas.xa_index > max || xa_is_value(head)) + if (xas.xa_index > max || xa_is_value(folio)) break; - if (!page_cache_get_speculative(head)) + if (!folio_try_get_rcu(folio)) goto retry; - /* Has the page moved or been split? */ - if (unlikely(head != xas_reload(&xas))) - goto put_page; + if (unlikely(folio != xas_reload(&xas))) + goto put_folio; - if (!pagevec_add(pvec, head)) + if (!folio_batch_add(fbatch, folio)) break; - if (!PageUptodate(head)) + if (!folio_test_uptodate(folio)) break; - if (PageReadahead(head)) + if (folio_test_readahead(folio)) break; - xas.xa_index = head->index + thp_nr_pages(head) - 1; - xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK; + xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1); continue; -put_page: - put_page(head); +put_folio: + folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } -static int filemap_read_page(struct file *file, struct address_space *mapping, - struct page *page) +static int filemap_read_folio(struct file *file, struct address_space *mapping, + struct folio *folio) { int error; @@ -2386,52 +2336,51 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, * eg. multipath errors. PG_error will be set again if readpage * fails. */ - ClearPageError(page); + folio_clear_error(folio); /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(file, page); + error = mapping->a_ops->readpage(file, &folio->page); if (error) return error; - error = wait_on_page_locked_killable(page); + error = folio_wait_locked_killable(folio); if (error) return error; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, - loff_t pos, struct iov_iter *iter, struct page *page) + loff_t pos, struct iov_iter *iter, struct folio *folio) { int count; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (iov_iter_is_pipe(iter)) return false; if (!mapping->a_ops->is_partially_uptodate) return false; - if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page))) + if (mapping->host->i_blkbits >= folio_shift(folio)) return false; count = iter->count; - if (page_offset(page) > pos) { - count -= page_offset(page) - pos; + if (folio_pos(folio) > pos) { + count -= folio_pos(folio) - pos; pos = 0; } else { - pos -= page_offset(page); + pos -= folio_pos(folio); } - return mapping->a_ops->is_partially_uptodate(page, pos, count); + return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *iter, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); int error; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -2447,7 +2396,11 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); - put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE); + /* + * This is where we usually end up waiting for a + * previously submitted readahead to finish. + */ + folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); @@ -2460,14 +2413,14 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock; error = 0; - if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page)) + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; - error = filemap_read_page(iocb->ki_filp, mapping, &folio->page); + error = filemap_read_folio(iocb->ki_filp, mapping, folio); goto unlock_mapping; unlock: folio_unlock(folio); @@ -2478,70 +2431,72 @@ unlock_mapping: return error; } -static int filemap_create_page(struct file *file, +static int filemap_create_folio(struct file *file, struct address_space *mapping, pgoff_t index, - struct pagevec *pvec) + struct folio_batch *fbatch) { - struct page *page; + struct folio *folio; int error; - page = page_cache_alloc(mapping); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); + if (!folio) return -ENOMEM; /* - * Protect against truncate / hole punch. Grabbing invalidate_lock here - * assures we cannot instantiate and bring uptodate new pagecache pages - * after evicting page cache during truncate and before actually - * freeing blocks. Note that we could release invalidate_lock after - * inserting the page into page cache as the locked page would then be - * enough to synchronize with hole punching. But there are code paths - * such as filemap_update_page() filling in partially uptodate pages or - * ->readpages() that need to hold invalidate_lock while mapping blocks - * for IO so let's hold the lock here as well to keep locking rules - * simple. + * Protect against truncate / hole punch. Grabbing invalidate_lock + * here assures we cannot instantiate and bring uptodate new + * pagecache folios after evicting page cache during truncate + * and before actually freeing blocks. Note that we could + * release invalidate_lock after inserting the folio into + * the page cache as the locked folio would then be enough to + * synchronize with hole punching. But there are code paths + * such as filemap_update_page() filling in partially uptodate + * pages or ->readpages() that need to hold invalidate_lock + * while mapping blocks for IO so let's hold the lock here as + * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); - error = add_to_page_cache_lru(page, mapping, index, + error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; - error = filemap_read_page(file, mapping, page); + error = filemap_read_folio(file, mapping, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); - pagevec_add(pvec, page); + folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); - put_page(page); + folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, - struct address_space *mapping, struct page *page, + struct address_space *mapping, struct folio *folio, pgoff_t last_index) { + DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_async_readahead(mapping, &file->f_ra, file, page, - page->index, last_index - page->index); + page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, - struct pagevec *pvec) + struct folio_batch *fbatch) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; - struct page *page; + struct folio *folio; int err = 0; last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); @@ -2549,34 +2504,35 @@ retry: if (fatal_signal_pending(current)) return -EINTR; - filemap_get_read_batch(mapping, index, last_index, pvec); - if (!pagevec_count(pvec)) { + filemap_get_read_batch(mapping, index, last_index, fbatch); + if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - filemap_get_read_batch(mapping, index, last_index, pvec); + filemap_get_read_batch(mapping, index, last_index, fbatch); } - if (!pagevec_count(pvec)) { + if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - err = filemap_create_page(filp, mapping, - iocb->ki_pos >> PAGE_SHIFT, pvec); + err = filemap_create_folio(filp, mapping, + iocb->ki_pos >> PAGE_SHIFT, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } - page = pvec->pages[pagevec_count(pvec) - 1]; - if (PageReadahead(page)) { - err = filemap_readahead(iocb, filp, mapping, page, last_index); + folio = fbatch->folios[folio_batch_count(fbatch) - 1]; + if (folio_test_readahead(folio)) { + err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } - if (!PageUptodate(page)) { - if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) + if (!folio_test_uptodate(folio)) { + if ((iocb->ki_flags & IOCB_WAITQ) && + folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; - err = filemap_update_page(iocb, mapping, iter, page); + err = filemap_update_page(iocb, mapping, iter, folio); if (err) goto err; } @@ -2584,8 +2540,8 @@ retry: return 0; err: if (err < 0) - put_page(page); - if (likely(--pvec->nr)) + folio_put(folio); + if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; @@ -2612,7 +2568,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct pagevec pvec; + struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2623,7 +2579,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - pagevec_init(&pvec); + folio_batch_init(&fbatch); do { cond_resched(); @@ -2639,7 +2595,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter, &pvec); + error = filemap_get_pages(iocb, iter, &fbatch); if (error < 0) break; @@ -2653,7 +2609,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) - goto put_pages; + goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* @@ -2668,33 +2624,29 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, */ if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) - mark_page_accessed(pvec.pages[0]); + folio_mark_accessed(fbatch.folios[0]); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - size_t page_size = thp_size(page); - size_t offset = iocb->ki_pos & (page_size - 1); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + size_t fsize = folio_size(folio); + size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, - page_size - offset); + fsize - offset); size_t copied; - if (end_offset < page_offset(page)) + if (end_offset < folio_pos(folio)) break; if (i > 0) - mark_page_accessed(page); + folio_mark_accessed(folio); /* - * If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. + * If users can be writing to this folio using arbitrary + * virtual addresses, take care of potential aliasing + * before reading the folio on the kernel side. */ - if (writably_mapped) { - int j; + if (writably_mapped) + flush_dcache_folio(folio); - for (j = 0; j < thp_nr_pages(page); j++) - flush_dcache_page(page + j); - } - - copied = copy_page_to_iter(page, offset, bytes, iter); + copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; @@ -2705,10 +2657,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, break; } } -put_pages: - for (i = 0; i < pagevec_count(&pvec); i++) - put_page(pvec.pages[i]); - pagevec_reinit(&pvec); +put_folios: + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_put(fbatch.folios[i]); + folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); @@ -2793,44 +2745,44 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); -static inline loff_t page_seek_hole_data(struct xa_state *xas, - struct address_space *mapping, struct page *page, +static inline loff_t folio_seek_hole_data(struct xa_state *xas, + struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); - if (xa_is_value(page) || PageUptodate(page)) + if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); - lock_page(page); - if (unlikely(page->mapping != mapping)) + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) goto unlock; - offset = offset_in_thp(page, start) & ~(bsz - 1); + offset = offset_in_folio(folio, start) & ~(bsz - 1); do { - if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) + if (ops->is_partially_uptodate(&folio->page, offset, bsz) == + seek_data) break; start = (start + bsz) & ~(bsz - 1); offset += bsz; - } while (offset < thp_size(page)); + } while (offset < folio_size(folio)); unlock: - unlock_page(page); + folio_unlock(folio); rcu_read_lock(); return start; } -static inline -unsigned int seek_page_size(struct xa_state *xas, struct page *page) +static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { - if (xa_is_value(page)) + if (xa_is_value(folio)) return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); - return thp_size(page); + return folio_size(folio); } /** @@ -2857,15 +2809,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); - struct page *page; + struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); - while ((page = find_get_entry(&xas, max, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; - unsigned int seek_size; + size_t seek_size; if (start < pos) { if (!seek_data) @@ -2873,9 +2825,9 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - seek_size = seek_page_size(&xas, page); - pos = round_up(pos + 1, seek_size); - start = page_seek_hole_data(&xas, mapping, page, start, pos, + seek_size = seek_folio_size(&xas, folio); + pos = round_up((u64)pos + 1, seek_size); + start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; @@ -2883,15 +2835,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); - if (!xa_is_value(page)) - put_page(page); + if (!xa_is_value(folio)) + folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); - if (page && !xa_is_value(page)) - put_page(page); + if (folio && !xa_is_value(folio)) + folio_put(folio); if (start > end) return end; return start; @@ -2900,21 +2852,20 @@ unlock: #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* - * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock + * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. - * @page - the page to lock. + * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * - * This works similar to lock_page_or_retry in that it can drop the mmap_lock. - * It differs in that it actually returns the page locked if it returns 1 and 0 - * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin - * will point to the pinned file and needs to be fput()'ed at a later point. + * This works similar to lock_folio_or_retry in that it can drop the + * mmap_lock. It differs in that it actually returns the folio locked + * if it returns 1 and 0 if it couldn't lock the folio. If we did have + * to drop the mmap_lock then fpin will point to the pinned file and + * needs to be fput()'ed at a later point. */ -static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, +static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { - struct folio *folio = page_folio(page); - if (folio_trylock(folio)) return 1; @@ -3003,25 +2954,25 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, - struct page *page) + struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; - struct address_space *mapping = file->f_mapping; + DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; - pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; + mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); - if (PageReadahead(page)) { + + if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_async_readahead(mapping, ra, file, - page, offset, ra->ra_pages); + page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } @@ -3040,7 +2991,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock - * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). + * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. @@ -3056,28 +3007,27 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t offset = vmf->pgoff; - pgoff_t max_off; - struct page *page; + pgoff_t max_idx, index = vmf->pgoff; + struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; /* * Do we have something in the page cache already? */ - page = find_get_page(mapping, offset); - if (likely(page)) { + folio = filemap_get_folio(mapping, index); + if (likely(folio)) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) - fpin = do_async_mmap_readahead(vmf, page); - if (unlikely(!PageUptodate(page))) { + fpin = do_async_mmap_readahead(vmf, folio); + if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } @@ -3089,17 +3039,17 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) fpin = do_sync_mmap_readahead(vmf); retry_find: /* - * See comment in filemap_create_page() why we need + * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } - page = pagecache_get_page(mapping, offset, + folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); - if (!page) { + if (!folio) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); @@ -3107,22 +3057,22 @@ retry_find: } } - if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ - if (unlikely(compound_head(page)->mapping != mapping)) { - unlock_page(page); - put_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto retry_find; } - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. */ - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { /* * The page was in cache and uptodate and now it is not. * Strange but possible since we didn't hold the page lock all @@ -3130,8 +3080,8 @@ retry_find: * try again. */ if (!mapping_locked) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto retry_find; } goto page_not_uptodate; @@ -3143,7 +3093,7 @@ retry_find: * redo the fault. */ if (fpin) { - unlock_page(page); + folio_unlock(folio); goto out_retry; } if (mapping_locked) @@ -3153,14 +3103,14 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) { - unlock_page(page); - put_page(page); + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) { + folio_unlock(folio); + folio_put(folio); return VM_FAULT_SIGBUS; } - vmf->page = page; + vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: @@ -3171,10 +3121,10 @@ page_not_uptodate: * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - error = filemap_read_page(file, mapping, page); + error = filemap_read_folio(file, mapping, folio); if (fpin) goto out_retry; - put_page(page); + folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; @@ -3188,8 +3138,8 @@ out_retry: * re-find the vma and come back and find our hopefully still populated * page. */ - if (page) - put_page(page); + if (folio) + folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) @@ -3231,48 +3181,48 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) return false; } -static struct page *next_uptodate_page(struct page *page, +static struct folio *next_uptodate_page(struct folio *folio, struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { unsigned long max_idx; do { - if (!page) + if (!folio) return NULL; - if (xas_retry(xas, page)) + if (xas_retry(xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - if (PageLocked(page)) + if (folio_test_locked(folio)) continue; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) continue; /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(xas))) + if (unlikely(folio != xas_reload(xas))) goto skip; - if (!PageUptodate(page) || PageReadahead(page)) + if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto skip; - if (page->mapping != mapping) + if (folio->mapping != mapping) goto unlock; - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; - return page; + return folio; unlock: - unlock_page(page); + folio_unlock(folio); skip: - put_page(page); - } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); + folio_put(folio); + } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } -static inline struct page *first_map_page(struct address_space *mapping, +static inline struct folio *first_map_page(struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { @@ -3280,7 +3230,7 @@ static inline struct page *first_map_page(struct address_space *mapping, mapping, xas, end_pgoff); } -static inline struct page *next_map_page(struct address_space *mapping, +static inline struct folio *next_map_page(struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { @@ -3297,16 +3247,17 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct folio *folio; + struct page *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); vm_fault_t ret = 0; rcu_read_lock(); - head = first_map_page(mapping, &xas, end_pgoff); - if (!head) + folio = first_map_page(mapping, &xas, end_pgoff); + if (!folio) goto out; - if (filemap_map_pmd(vmf, head)) { + if (filemap_map_pmd(vmf, &folio->page)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3314,7 +3265,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); do { - page = find_subpage(head, xas.xa_index); +again: + page = folio_file_page(folio, xas.xa_index); if (PageHWPoison(page)) goto unlock; @@ -3335,12 +3287,21 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, do_set_pte(vmf, page, addr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, addr, vmf->pte); - unlock_page(head); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } + folio_unlock(folio); continue; unlock: - unlock_page(head); - put_page(head); - } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + goto again; + } + folio_unlock(folio); + folio_put(folio); + } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); @@ -3352,24 +3313,24 @@ EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* - * We mark the page dirty already here so that when freeze is in + * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will - * see the dirty page and writeprotect it again. + * see the dirty folio and writeprotect it again. */ - set_page_dirty(page); - wait_for_stable_page(page); + folio_mark_dirty(folio); + folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; @@ -3422,35 +3383,20 @@ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); -static struct page *wait_on_page_read(struct page *page) +static struct folio *do_read_cache_folio(struct address_space *mapping, + pgoff_t index, filler_t filler, void *data, gfp_t gfp) { - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - put_page(page); - page = ERR_PTR(-EIO); - } - } - return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) -{ - struct page *page; + struct folio *folio; int err; repeat: - page = find_get_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp); - if (!page) + folio = filemap_get_folio(mapping, index); + if (!folio) { + folio = filemap_alloc_folio(gfp, 0); + if (!folio) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, gfp); + err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { - put_page(page); + folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ @@ -3459,71 +3405,41 @@ repeat: filler: if (filler) - err = filler(data, page); + err = filler(data, &folio->page); else - err = mapping->a_ops->readpage(data, page); + err = mapping->a_ops->readpage(data, &folio->page); if (err < 0) { - put_page(page); + folio_put(folio); return ERR_PTR(err); } - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; + folio_wait_locked(folio); + if (!folio_test_uptodate(folio)) { + folio_put(folio); + return ERR_PTR(-EIO); + } + goto out; } - if (PageUptodate(page)) - goto out; - - /* - * Page is not up to date and may be locked due to one of the following - * case a: Page is being filled and the page lock is held - * case b: Read/write error clearing the page uptodate status - * case c: Truncation in progress (page locked) - * case d: Reclaim in progress - * - * Case a, the page will be up to date when the page is unlocked. - * There is no need to serialise on the page lock here as the page - * is pinned so the lock gives no additional protection. Even if the - * page is truncated, the data is still valid if PageUptodate as - * it's a race vs truncate race. - * Case b, the page will not be up to date - * Case c, the page may be truncated but in itself, the data may still - * be valid after IO completes as it's a read vs truncate race. The - * operation must restart if the page is not uptodate on unlock but - * otherwise serialising on page lock to stabilise the mapping gives - * no additional guarantees to the caller as the page lock is - * released before return. - * Case d, similar to truncation. If reclaim holds the page lock, it - * will be a race with remove_mapping that determines if the mapping - * is valid on unlock but otherwise the data is valid and there is - * no need to serialise with page lock. - * - * As the page lock gives no additional guarantee, we optimistically - * wait on the page to be unlocked and check if it's up to date and - * use the page if it is. Otherwise, the page lock is required to - * distinguish between the different cases. The motivation is that we - * avoid spurious serialisations and wakeups when multiple processes - * wait on the same page for IO to complete. - */ - wait_on_page_locked(page); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - /* Distinguish between all the cases under the safety of the lock */ - lock_page(page); + if (!folio_trylock(folio)) { + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + goto repeat; + } - /* Case c or d, restart the operation */ - if (!page->mapping) { - unlock_page(page); - put_page(page); + /* Folio was truncated from mapping */ + if (!folio->mapping) { + folio_unlock(folio); + folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); goto out; } @@ -3533,16 +3449,16 @@ filler: * Clear page error before actual read, PG_error will be * set again if read page fails. */ - ClearPageError(page); + folio_clear_error(folio); goto filler; out: - mark_page_accessed(page); - return page; + folio_mark_accessed(folio); + return folio; } /** - * read_cache_page - read into page cache, fill it if needed + * read_cache_folio - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read @@ -3557,10 +3473,27 @@ out: * * Return: up to date page on success, ERR_PTR() on failure. */ +struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, + filler_t filler, void *data) +{ + return do_read_cache_folio(mapping, index, filler, data, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(read_cache_folio); + +static struct page *do_read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, void *data, gfp_t gfp) +{ + struct folio *folio; + + folio = do_read_cache_folio(mapping, index, filler, data, gfp); + if (IS_ERR(folio)) + return &folio->page; + return folio_file_page(folio, index); +} + struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) + pgoff_t index, filler_t *filler, void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -3920,33 +3853,32 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) EXPORT_SYMBOL(generic_file_write_iter); /** - * try_to_release_page() - release old fs-specific metadata on a page - * - * @page: the page which the kernel is trying to free - * @gfp_mask: memory allocation flags (and I/O mode) + * filemap_release_folio() - Release fs-specific metadata on a folio. + * @folio: The folio which the kernel is trying to free. + * @gfp: Memory allocation flags (and I/O mode). * - * The address_space is to try to release any data against the page - * (presumably at page->private). + * The address_space is trying to release any data attached to a folio + * (presumably at folio->private). * - * This may also be called if PG_fscache is set on a page, indicating that the - * page is known to the local caching routines. + * This will also be called if the private_2 flag is set on a page, + * indicating that the folio has other metadata associated with it. * - * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). + * The @gfp argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block + * (__GFP_RECLAIM & __GFP_FS). * - * Return: %1 if the release was successful, otherwise return zero. + * Return: %true if the release was successful, otherwise %false. */ -int try_to_release_page(struct page *page, gfp_t gfp_mask) +bool filemap_release_folio(struct folio *folio, gfp_t gfp) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = folio->mapping; - BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) - return 0; + BUG_ON(!folio_test_locked(folio)); + if (folio_test_writeback(folio)) + return false; if (mapping && mapping->a_ops->releasepage) - return mapping->a_ops->releasepage(page, gfp_mask); - return try_to_free_buffers(page); + return mapping->a_ops->releasepage(&folio->page, gfp); + return try_to_free_buffers(&folio->page); } - -EXPORT_SYMBOL(try_to_release_page); +EXPORT_SYMBOL(filemap_release_folio); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 5b6ae1da314e..749555a232a8 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -140,3 +140,14 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); + +void delete_from_page_cache(struct page *page) +{ + return filemap_remove_folio(page_folio(page)); +} + +int try_to_release_page(struct page *page, gfp_t gfp) +{ + return filemap_release_folio(page_folio(page), gfp); +} +EXPORT_SYMBOL(try_to_release_page); diff --git a/mm/frontswap.c b/mm/frontswap.c index 130e301c5ac0..6bed12260dea 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -127,7 +127,7 @@ void frontswap_register_ops(struct frontswap_ops *ops) spin_lock(&swap_lock); plist_for_each_entry(si, &swap_active_head, list) { if (!WARN_ON(!si->frontswap_map)) - set_bit(si->type, a); + __set_bit(si->type, a); } spin_unlock(&swap_lock); @@ -149,7 +149,7 @@ void frontswap_register_ops(struct frontswap_ops *ops) spin_lock(&swap_lock); plist_for_each_entry(si, &swap_active_head, list) { if (si->frontswap_map) - set_bit(si->type, b); + __set_bit(si->type, b); } spin_unlock(&swap_lock); @@ -642,12 +642,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, } retry: if (!pmd_present(pmdval)) { + /* + * Should never reach here, if thp migration is not supported; + * Otherwise, it must be a thp migration entry. + */ + VM_BUG_ON(!thp_migration_supported() || + !is_pmd_migration_entry(pmdval)); + if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(pmdval)); - if (is_pmd_migration_entry(pmdval)) - pmd_migration_entry_wait(mm, pmd); + + pmd_migration_entry_wait(mm, pmd); pmdval = READ_ONCE(*pmd); /* * MADV_DONTNEED may convert the pmd to null because @@ -1672,21 +1677,22 @@ size_t fault_in_writeable(char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_write_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__put_user(0, uaddr) != 0)) - return size; + unsafe_put_user(0, uaddr, out); uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__put_user(0, uaddr) != 0)) - goto out; + unsafe_put_user(0, uaddr, out); uaddr += PAGE_SIZE; } out: + user_write_access_end(); if (size > uaddr - start) return size - (uaddr - start); return 0; @@ -1771,21 +1777,22 @@ size_t fault_in_readable(const char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_read_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__get_user(c, uaddr) != 0)) - return size; + unsafe_get_user(c, uaddr, out); uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__get_user(c, uaddr) != 0)) - goto out; + unsafe_get_user(c, uaddr, out); uaddr += PAGE_SIZE; } out: + user_read_access_end(); (void)c; if (size > uaddr - start) return size - (uaddr - start); @@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ - if (pte_special(pte) && !pte_devmap(pte) && + if (!vm_normal_page(walk->vma, addr, pte) && + !pte_devmap(pte) && !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); @@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) && + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) && vma->vm_flags & VM_READ) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e5483347291c..406a3c28c026 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (reuse_swap_page(page, NULL)) { + if (reuse_swap_page(page)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -2542,38 +2542,28 @@ int total_mapcount(struct page *page) * need full accuracy to avoid breaking page pinning, because * page_trans_huge_mapcount() is slower than page_mapcount(). */ -int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +int page_trans_huge_mapcount(struct page *page) { - int i, ret, _total_mapcount, mapcount; + int i, ret; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; - } + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1; page = compound_head(page); - _total_mapcount = ret = 0; + ret = 0; for (i = 0; i < thp_nr_pages(page); i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; + int mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); - _total_mapcount += mapcount; } - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) ret -= 1; - _total_mapcount -= thp_nr_pages(page); - } - mapcount = compound_mapcount(page); - ret += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; - return ret; + + return ret + compound_mapcount(page); } /* Racy check whether the huge page can be split */ @@ -2614,6 +2604,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct deferred_split *ds_queue = get_deferred_split_queue(head); + XA_STATE(xas, &head->mapping->i_pages, head->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@ -2652,6 +2643,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } + xas_split_alloc(&xas, head, compound_order(head), + mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + if (xas_error(&xas)) { + ret = xas_error(&xas); + goto out; + } + anon_vma = NULL; i_mmap_lock_read(mapping); @@ -2681,13 +2679,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { - XA_STATE(xas, &mapping->i_pages, page_index(head)); - /* * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. */ - xa_lock(&mapping->i_pages); + xas_lock(&xas); + xas_reset(&xas); if (xas_load(&xas) != head) goto fail; } @@ -2703,6 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { int nr = thp_nr_pages(head); + xas_split(&xas, head, thp_order(head)); if (PageSwapBacked(head)) { __mod_lruvec_page_state(head, NR_SHMEM_THPS, -nr); @@ -2719,7 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); local_irq_enable(); remap_page(head, thp_nr_pages(head)); ret = -EBUSY; @@ -2733,6 +2731,8 @@ out_unlock: if (mapping) i_mmap_unlock_read(mapping); out: + /* Free any memory we didn't use */ + xas_nomem(&xas, 0); count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1baa198519a..61895cc01d09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4684,8 +4684,8 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr struct page *new_page) { __SetPageUptodate(new_page); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugepage_add_new_anon_rmap(new_page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); @@ -5259,10 +5259,10 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); + set_huge_pte_at(mm, haddr, ptep, + make_huge_pte(vma, new_page, 1)); SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 79d93534ef1e..f9942841df18 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, } } +static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) +{ + int node; + + for_each_node(node) + kfree(h_cgroup->nodeinfo[node]); + kfree(h_cgroup); +} + static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; + int node; + + h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), + GFP_KERNEL); - h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; + /* + * TODO: this routine can waste much memory for nodes which will + * never be onlined. It's better to use memory hotplug callback + * function. + */ + for_each_node(node) { + /* Set node_to_alloc to -1 for offline nodes. */ + int node_to_alloc = + node_state(node, N_NORMAL_MEMORY) ? node : -1; + h_cgroup->nodeinfo[node] = + kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), + GFP_KERNEL, node_to_alloc); + if (!h_cgroup->nodeinfo[node]) + goto fail_alloc_nodeinfo; + } + hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; + +fail_alloc_nodeinfo: + hugetlb_cgroup_free(h_cgroup); + return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { - struct hugetlb_cgroup *h_cgroup; - - h_cgroup = hugetlb_cgroup_from_css(css); - kfree(h_cgroup); + hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* @@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, return; __set_hugetlb_cgroup(page, h_cg, rsvd); - return; + if (!rsvd) { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage + nr_pages); + } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, @@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (rsvd) css_put(&h_cg->css); - - return; + else { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage - nr_pages); + } } void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, @@ -418,6 +466,59 @@ enum { RES_RSVD_FAILCNT, }; +static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) +{ + int nid; + struct cftype *cft = seq_cft(seq); + int idx = MEMFILE_IDX(cft->private); + bool legacy = MEMFILE_ATTR(cft->private); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + struct cgroup_subsys_state *css; + unsigned long usage; + + if (legacy) { + /* Add up usage across all nodes for the non-hierarchical total. */ + usage = 0; + for_each_node_state(nid, N_MEMORY) + usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); + seq_printf(seq, "total=%lu", usage * PAGE_SIZE); + + /* Simply print the per-node usage for the non-hierarchical total. */ + for_each_node_state(nid, N_MEMORY) + seq_printf(seq, " N%d=%lu", nid, + READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * + PAGE_SIZE); + seq_putc(seq, '\n'); + } + + /* + * The hierarchical total is pretty much the value recorded by the + * counter, so use that. + */ + seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", + page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); + + /* + * For each node, transverse the css tree to obtain the hierarchical + * node usage. + */ + for_each_node_state(nid, N_MEMORY) { + usage = 0; + rcu_read_lock(); + css_for_each_descendant_pre(css, &h_cg->css) { + usage += READ_ONCE(hugetlb_cgroup_from_css(css) + ->nodeinfo[nid] + ->usage[idx]); + } + rcu_read_unlock(); + seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); + } + + seq_putc(seq, '\n'); + + return 0; +} + static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) events_local_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_dfl[6]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[7]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, @@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx) cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_legacy[8]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 1); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_legacy[9]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, diff --git a/mm/internal.h b/mm/internal.h index 3b79a5c9427a..d80300392a19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -12,6 +12,8 @@ #include <linux/pagemap.h> #include <linux/tracepoint-defs.h> +struct folio_batch; + /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints @@ -21,7 +23,7 @@ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC) + __GFP_ATOMIC|__GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@ -74,6 +76,7 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma) return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); } +struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, @@ -90,7 +93,13 @@ static inline void force_page_cache_readahead(struct address_space *mapping, } unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices); + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +void filemap_free_folio(struct address_space *mapping, struct folio *folio); +int truncate_inode_folio(struct address_space *mapping, struct folio *folio); +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, + loff_t end); /** * folio_evictable - Test whether a folio is evictable. @@ -158,11 +167,6 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* - * in mm/memcontrol.c: - */ -extern bool cgroup_memory_nokmem; - -/* * in mm/page_alloc.c */ @@ -388,6 +392,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); #ifdef CONFIG_MMU +void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_vma_page_range(struct vm_area_struct *vma, @@ -491,8 +496,8 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } return fpin; } - #else /* !CONFIG_MMU */ +static inline void unmap_mapping_folio(struct folio *folio) { } static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 587da8995f2d..08291ed33e93 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -132,12 +132,23 @@ static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); unsigned long flags; if (IS_ENABLED(CONFIG_SLAB)) local_irq_save(flags); /* + * If init_on_free is enabled and KASAN's free metadata is stored in + * the object, zero the metadata. Otherwise, the object's memory will + * not be properly zeroed, as KASAN saves the metadata after the slab + * allocator zeroes the object. + */ + if (slab_want_init_on_free(cache) && + cache->kasan_info.free_meta_offset == 0) + memzero_explicit(meta, sizeof(*meta)); + + /* * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 4a4929b29a23..94136f84b449 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -498,7 +498,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size) +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -520,9 +520,14 @@ int kasan_module_alloc(void *addr, size_t size) __builtin_return_address(0)); if (ret) { + struct vm_struct *vm = find_vm_area(addr); __memset(ret, KASAN_SHADOW_INIT, shadow_size); - find_vm_area(addr)->flags |= VM_KASAN; + vm->flags |= VM_KASAN; kmemleak_ignore(ret); + + if (vm->flags & VM_DEFER_KMEMLEAK) + kmemleak_vmalloc(vm, size, gfp_mask); + return 0; } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 267dfde43b91..5ad40e3add45 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -684,6 +684,7 @@ static const struct file_operations objects_fops = { .open = open_objects, .read = seq_read, .llseek = seq_lseek, + .release = seq_release, }; static int __init kfence_debugfs_init(void) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e99101162f1a..35f14d0a00a6 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -618,6 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } @@ -636,6 +637,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } @@ -681,7 +683,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { + !reuse_swap_page(page)) { /* * Page is in the swap cache and cannot be re-used. * It cannot be collapsed into a THP. @@ -756,11 +758,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * ptl mostly unnecessary. */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); } } else { @@ -774,11 +772,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * inside page_remove_rmap(). */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); @@ -1261,6 +1255,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } @@ -1270,6 +1265,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } @@ -1298,6 +1294,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } @@ -1306,7 +1303,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max + * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); @@ -1667,7 +1664,10 @@ static void collapse_file(struct mm_struct *mm, } count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); - /* This will be less messy when we use multi-index entries */ + /* + * Ensure we have slots for all the pages in the range. This is + * almost certainly a no-op because most of the pages must be present + */ do { xas_lock_irq(&xas); xas_create_range(&xas); @@ -1892,6 +1892,9 @@ out_unlock: __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } + /* Join all the small entries into a single multi-index entry */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, new_page); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -2008,11 +2011,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } + /* + * XXX: khugepaged should compact smaller compound pages + * into a PMD sized page + */ if (PageTransCompound(page)) { result = SCAN_PAGE_COMPOUND; break; @@ -2054,6 +2062,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (result == SCAN_SUCCEED) { if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { node = khugepaged_find_target_node(); collapse_file(mm, file, start, hpage, node); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b57383c17cf6..dc3758fdba68 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -381,15 +381,20 @@ static void dump_object_info(struct kmemleak_object *object) static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { struct rb_node *rb = object_tree_root.rb_node; + unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); while (rb) { - struct kmemleak_object *object = - rb_entry(rb, struct kmemleak_object, rb_node); - if (ptr < object->pointer) + struct kmemleak_object *object; + unsigned long untagged_objp; + + object = rb_entry(rb, struct kmemleak_object, rb_node); + untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); + + if (untagged_ptr < untagged_objp) rb = object->rb_node.rb_left; - else if (object->pointer + object->size <= ptr) + else if (untagged_objp + object->size <= untagged_ptr) rb = object->rb_node.rb_right; - else if (object->pointer == ptr || alias) + else if (untagged_objp == untagged_ptr || alias) return object; else { kmemleak_warn("Found object by alias at 0x%08lx\n", @@ -576,6 +581,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object, *parent; struct rb_node **link, *rb_parent; unsigned long untagged_ptr; + unsigned long untagged_objp; object = mem_pool_alloc(gfp); if (!object) { @@ -629,9 +635,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, while (*link) { rb_parent = *link; parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); - if (ptr + size <= parent->pointer) + untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer); + if (untagged_ptr + size <= untagged_objp) link = &parent->rb_node.rb_left; - else if (parent->pointer + parent->size <= ptr) + else if (untagged_objp + parent->size <= untagged_ptr) link = &parent->rb_node.rb_right; else { kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", @@ -15,6 +15,7 @@ #include <linux/errno.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> @@ -2575,8 +2576,8 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ - } else if (anon_vma->root == vma->anon_vma->root && - page->index == linear_page_index(vma, address)) { + } else if (page->index == linear_page_index(vma, address) && + anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (!PageUptodate(page)) diff --git a/mm/madvise.c b/mm/madvise.c index 8c927202bbe6..5604064df464 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,8 @@ #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/mm_inline.h> +#include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> @@ -62,83 +64,122 @@ static int madvise_need_mmap_write(int behavior) } } +#ifdef CONFIG_ANON_VMA_NAME +static struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + struct anon_vma_name *anon_name; + size_t count; + + /* Add 1 for NUL terminator at the end of the anon_name->name */ + count = strlen(name) + 1; + anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); + if (anon_name) { + kref_init(&anon_name->kref); + memcpy(anon_name->name, name, count); + } + + return anon_name; +} + +static void vma_anon_name_free(struct kref *kref) +{ + struct anon_vma_name *anon_name = + container_of(kref, struct anon_vma_name, kref); + kfree(anon_name); +} + +static inline bool has_vma_anon_name(struct vm_area_struct *vma) +{ + return !vma->vm_file && vma->anon_name; +} + +const char *vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return NULL; + + mmap_assert_locked(vma->vm_mm); + + return vma->anon_name->name; +} + +void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + if (!has_vma_anon_name(orig_vma)) + return; + + kref_get(&orig_vma->anon_name->kref); + new_vma->anon_name = orig_vma->anon_name; +} + +void free_vma_anon_name(struct vm_area_struct *vma) +{ + struct anon_vma_name *anon_name; + + if (!has_vma_anon_name(vma)) + return; + + anon_name = vma->anon_name; + vma->anon_name = NULL; + kref_put(&anon_name->kref, vma_anon_name_free); +} + +/* mmap_lock should be write-locked */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + const char *anon_name; + + if (!name) { + free_vma_anon_name(vma); + return 0; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + /* Same name, nothing to do here */ + if (!strcmp(name, anon_name)) + return 0; + + free_vma_anon_name(vma); + } + vma->anon_name = anon_vma_name_alloc(name); + if (!vma->anon_name) + return -ENOMEM; + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ /* - * We can potentially split a vm area into separate - * areas, each area with its own behavior. + * Update the vm_flags on region of a vma, splitting it or merging it as + * necessary. Must be called with mmap_sem held for writing; */ -static long madvise_behavior(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) +static int madvise_update_vma(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long new_flags, + const char *name) { struct mm_struct *mm = vma->vm_mm; - int error = 0; + int error; pgoff_t pgoff; - unsigned long new_flags = vma->vm_flags; - switch (behavior) { - case MADV_NORMAL: - new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; - break; - case MADV_SEQUENTIAL: - new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; - break; - case MADV_RANDOM: - new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; - break; - case MADV_DONTFORK: - new_flags |= VM_DONTCOPY; - break; - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTCOPY; - break; - case MADV_WIPEONFORK: - /* MADV_WIPEONFORK is only supported on anonymous memory. */ - if (vma->vm_file || vma->vm_flags & VM_SHARED) { - error = -EINVAL; - goto out; - } - new_flags |= VM_WIPEONFORK; - break; - case MADV_KEEPONFORK: - new_flags &= ~VM_WIPEONFORK; - break; - case MADV_DONTDUMP: - new_flags |= VM_DONTDUMP; - break; - case MADV_DODUMP: - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTDUMP; - break; - case MADV_MERGEABLE: - case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) - goto out_convert_errno; - break; - case MADV_HUGEPAGE: - case MADV_NOHUGEPAGE: - error = hugepage_madvise(vma, &new_flags, behavior); - if (error) - goto out_convert_errno; - break; - } - - if (new_flags == vma->vm_flags) { + if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { *prev = vma; - goto out; + return 0; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, name); if (*prev) { vma = *prev; goto success; @@ -147,23 +188,19 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, start, 1); if (error) - goto out_convert_errno; + return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, end, 0); if (error) - goto out_convert_errno; + return error; } success: @@ -171,16 +208,13 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_vma_anon_name(vma, name); + if (error) + return error; + } -out_convert_errno: - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; -out: - return error; + return 0; } #ifdef CONFIG_SWAP @@ -930,6 +964,95 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +/* + * Apply an madvise behavior to a region of a vma. madvise_update_vma + * will handle splitting a vm area into separate areas, each area with its own + * behavior. + */ +static int madvise_vma_behavior(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long behavior) +{ + int error; + unsigned long new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); + case MADV_FREE: + case MADV_DONTNEED: + return madvise_dontneed_free(vma, prev, start, end, behavior); + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return madvise_populate(vma, prev, start, end, behavior); + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) + return -EINVAL; + new_flags &= ~VM_DONTCOPY; + break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return -EINVAL; + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; + case MADV_DONTDUMP: + new_flags |= VM_DONTDUMP; + break; + case MADV_DODUMP: + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + return -EINVAL; + new_flags &= ~VM_DONTDUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + } + + error = madvise_update_vma(vma, prev, start, end, new_flags, + vma_anon_name(vma)); + +out: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. @@ -978,30 +1101,6 @@ static int madvise_inject_error(int behavior, } #endif -static long -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - return madvise_remove(vma, prev, start, end); - case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); - case MADV_COLD: - return madvise_cold(vma, prev, start, end); - case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); - case MADV_FREE: - case MADV_DONTNEED: - return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); - default: - return madvise_behavior(vma, prev, start, end, behavior); - } -} - static bool madvise_behavior_valid(int behavior) { @@ -1056,6 +1155,122 @@ process_madvise_behavior_valid(int behavior) } /* + * Walk the vmas in range [start,end), and call the visit function on each one. + * The visit function will get start and end parameters that cover the overlap + * between the current vma and the original range. Any unmapped regions in the + * original range will result in this function returning -ENOMEM while still + * calling the visit function on all of the existing vmas in the range. + * Must be called with the mmap_lock held for reading or writing. + */ +static +int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long arg, + int (*visit)(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long arg)) +{ + struct vm_area_struct *vma; + struct vm_area_struct *prev; + unsigned long tmp; + int unmapped_error = 0; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. + */ + vma = find_vma_prev(mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + int error; + + /* Still start < end. */ + if (!vma) + return -ENOMEM; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + break; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = visit(vma, &prev, start, tmp, arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + if (start >= end) + break; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_lock */ + vma = find_vma(mm, start); + } + + return unmapped_error; +} + +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (const char *)name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ +/* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should @@ -1127,10 +1342,8 @@ process_madvise_behavior_valid(int behavior) */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { - unsigned long end, tmp; - struct vm_area_struct *vma, *prev; - int unmapped_error = 0; - int error = -EINVAL; + unsigned long end; + int error; int write; size_t len; struct blk_plug plug; @@ -1138,23 +1351,22 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr(start); if (!madvise_behavior_valid(behavior)) - return error; + return -EINVAL; if (!PAGE_ALIGNED(start)) - return error; + return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - return error; + return -EINVAL; end = start + len; if (end < start) - return error; + return -EINVAL; - error = 0; if (end == start) - return error; + return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) @@ -1169,51 +1381,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh mmap_read_lock(mm); } - /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - * - different from the way of handling in mlock etc. - */ - vma = find_vma_prev(mm, start, &prev); - if (vma && start > vma->vm_start) - prev = vma; - blk_start_plug(&plug); - for (;;) { - /* Still start < end. */ - error = -ENOMEM; - if (!vma) - goto out; - - /* Here start < (end|vma->vm_end). */ - if (start < vma->vm_start) { - unmapped_error = -ENOMEM; - start = vma->vm_start; - if (start >= end) - goto out; - } - - /* Here vma->vm_start <= start < (end|vma->vm_end) */ - tmp = vma->vm_end; - if (end < tmp) - tmp = end; - - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = madvise_vma(vma, &prev, start, tmp, behavior); - if (error) - goto out; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; - error = unmapped_error; - if (start >= end) - goto out; - if (prev) - vma = prev->vm_next; - else /* madvise_remove dropped mmap_lock */ - vma = find_vma(mm, start); - } -out: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index ea734f248fce..1b0ab8fcfd8b 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -3,6 +3,7 @@ #include <linux/hugetlb.h> #include <linux/bitops.h> #include <linux/mmu_notifier.h> +#include <linux/mm_inline.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4a7b3ebf8e48..09d342c7cbd0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -84,7 +84,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -bool cgroup_memory_nokmem __ro_after_init; +static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -629,11 +629,17 @@ static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); -static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + unsigned int x; + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); - if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) - atomic_inc(&stats_flush_threshold); + + x = __this_cpu_add_return(stats_updates, abs(val)); + if (x > MEMCG_CHARGE_BATCH) { + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + __this_cpu_write(stats_updates, 0); + } } static void __mem_cgroup_flush_stats(void) @@ -656,7 +662,7 @@ void mem_cgroup_flush_stats(void) static void flush_memcg_stats_dwork(struct work_struct *w) { - mem_cgroup_flush_stats(); + __mem_cgroup_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); } @@ -672,7 +678,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -705,7 +711,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); } /** @@ -789,7 +795,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, return; __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, count); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -1369,6 +1375,7 @@ static const struct memory_stat memory_stats[] = { { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, + { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -4850,6 +4857,17 @@ out_kfree: return ret; } +#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +static int mem_cgroup_slab_show(struct seq_file *m, void *p) +{ + /* + * Deprecated. + * Please, take a look at tools/cgroup/slabinfo.py . + */ + return 0; +} +#endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -4950,7 +4968,7 @@ static struct cftype mem_cgroup_legacy_files[] = { (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) { .name = "kmem.slabinfo", - .seq_show = memcg_slab_show, + .seq_show = mem_cgroup_slab_show, }, #endif { @@ -5110,15 +5128,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - unsigned int size; int node; int __maybe_unused i; long error = -ENOMEM; - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); + memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) return ERR_PTR(error); @@ -6312,6 +6326,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_KILL])); + seq_printf(m, "oom_group_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); } static int memory_events_show(struct seq_file *m, void *v) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 07c875fdeaf0..14ae5c18e776 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -58,6 +58,7 @@ #include <linux/ratelimit.h> #include <linux/page-isolation.h> #include <linux/pagewalk.h> +#include <linux/shmem_fs.h> #include "internal.h" #include "ras/ras_event.h" @@ -722,7 +723,6 @@ static const char * const action_page_types[] = { [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", [MF_MSG_SLAB] = "kernel slab page", [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", @@ -737,7 +737,6 @@ static const char * const action_page_types[] = { [MF_MSG_CLEAN_LRU] = "clean LRU page", [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", - [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", @@ -867,6 +866,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -896,17 +896,23 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) } /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); - if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; } @@ -1154,6 +1160,22 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +static inline bool PageHWPoisonTakenOff(struct page *page) +{ + return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; +} + +void SetPageHWPoisonTakenOff(struct page *page) +{ + set_page_private(page, MAGIC_HWPOISON); +} + +void ClearPageHWPoisonTakenOff(struct page *page) +{ + if (PageHWPoison(page)) + set_page_private(page, 0); +} + /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. This function @@ -1256,6 +1278,27 @@ out: return ret; } +static int __get_unpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, + * but also isolated from buddy freelist, so need to identify the + * state and have to cancel both operations to unpoison. + */ + if (PageHWPoisonTakenOff(page)) + return -EHWPOISON; + + return get_page_unless_zero(page) ? 1 : 0; +} + /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) @@ -1263,7 +1306,7 @@ out: * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state - * (defined as a page-type we can successfully handle the memor error on it, + * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, @@ -1272,18 +1315,26 @@ out: * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * + * When called from unpoison_memory(), the caller should already ensure that + * the given page has PG_hwpoison. So it's never reused for other page + * allocations, and __get_unpoison_page() never races with them. + * * Return: 0 on failure, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle - * operations like allocation and free. + * operations like allocation and free, + * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret; zone_pcp_disable(page_zone(p)); - ret = get_any_page(p, flags); + if (flags & MF_UNPOISON) + ret = __get_unpoison_page(p); + else + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p)); return ret; @@ -1470,17 +1521,12 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (!(flags & MF_COUNT_INCREASED)) { res = get_hwpoison_page(p, flags); if (!res) { - /* - * Check "filter hit" and "race with other subpage." - */ lock_page(head); - if (PageHWPoison(head)) { - if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != head && TestSetPageHWPoison(head))) { + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(head)) num_poisoned_pages_dec(); - unlock_page(head); - return 0; - } + unlock_page(head); + return 0; } unlock_page(head); res = MF_FAILED; @@ -1499,14 +1545,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) lock_page(head); page_flags = head->flags; - if (!PageHWPoison(head)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(head); - put_page(head); - return 0; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@ -1620,6 +1658,8 @@ out: return rc; } +static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -1646,26 +1686,32 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); + mutex_lock(&mf_mutex); + p = pfn_to_online_page(pfn); if (!p) { + res = arch_memory_failure(pfn, flags); + if (res == 0) + goto unlock_mutex; + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); - if (pgmap) - return memory_failure_dev_pagemap(pfn, flags, - pgmap); + if (pgmap) { + res = memory_failure_dev_pagemap(pfn, flags, + pgmap); + goto unlock_mutex; + } } pr_err("Memory failure: %#lx: memory outside kernel control\n", pfn); - return -ENXIO; + res = -ENXIO; + goto unlock_mutex; } - mutex_lock(&mf_mutex); - try_again: if (PageHuge(p)) { res = memory_failure_hugetlb(pfn, flags); @@ -1780,16 +1826,6 @@ try_again: */ page_flags = p->flags; - /* - * unpoison always clear PG_hwpoison inside page lock - */ - if (!PageHWPoison(p)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(p); - put_page(p); - goto unlock_mutex; - } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); @@ -1953,6 +1989,28 @@ core_initcall(memory_failure_init); pr_info(fmt, pfn); \ }) +static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) +{ + if (TestClearPageHWPoison(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + num_poisoned_pages_dec(); + return 1; + } + return 0; +} + +static inline int unpoison_taken_off_page(struct ratelimit_state *rs, + struct page *p) +{ + if (put_page_back_buddy(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + return 0; + } + return -EBUSY; +} + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1969,8 +2027,7 @@ int unpoison_memory(unsigned long pfn) { struct page *page; struct page *p; - int freeit = 0; - unsigned long flags = 0; + int ret = -EBUSY; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1980,69 +2037,60 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; - } - - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", - pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } - if (!get_hwpoison_page(p, flags)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", - pfn, &unpoison_rs); - return 0; - } + if (PageSlab(page) || PageTable(page)) + goto unlock_mutex; - lock_page(page); - /* - * This test is racy because PG_hwpoison is set outside of page lock. - * That's acceptable because that won't trigger kernel panic. Instead, - * the PG_hwpoison page will be caught and isolated on the entrance to - * the free buddy page pool. - */ - if (TestClearPageHWPoison(page)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - pfn, &unpoison_rs); - num_poisoned_pages_dec(); - freeit = 1; - } - unlock_page(page); + ret = get_hwpoison_page(p, MF_UNPOISON); + if (!ret) { + if (clear_page_hwpoison(&unpoison_rs, page)) + ret = 0; + else + ret = -EBUSY; + } else if (ret < 0) { + if (ret == -EHWPOISON) { + ret = unpoison_taken_off_page(&unpoison_rs, p); + } else + unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + pfn, &unpoison_rs); + } else { + int freeit = clear_page_hwpoison(&unpoison_rs, p); - put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + put_page(page); + ret = 0; + } + } - return 0; +unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory); @@ -2223,9 +2271,12 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; } + mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); + mutex_unlock(&mf_mutex); return 0; } @@ -2239,9 +2290,12 @@ retry: } else if (ret == 0) { if (soft_offline_free_page(page) && try_again) { try_again = false; + flags &= ~MF_COUNT_INCREASED; goto retry; } } + mutex_unlock(&mf_mutex); + return ret; } diff --git a/mm/memory.c b/mm/memory.c index 8f1de811a1dc..f306e698a1e3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -41,6 +41,7 @@ #include <linux/kernel_stat.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> @@ -719,8 +720,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); - set_pte_at(vma->vm_mm, address, ptep, pte); - /* * No need to take a page reference as one was already * created when the swap entry was made. @@ -734,6 +733,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, */ WARN_ON_ONCE(!PageAnon(page)); + set_pte_at(vma->vm_mm, address, ptep, pte); + if (vma->vm_flags & VM_LOCKED) mlock_vma_page(page); @@ -1304,6 +1305,28 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return ret; } +/* + * Parameter block passed down to zap_pte_range in exceptional cases. + */ +struct zap_details { + struct address_space *zap_mapping; /* Check page->mapping if set */ + struct folio *single_folio; /* Locked folio to be unmapped */ +}; + +/* + * We set details->zap_mapping when we want to unmap shared but keep private + * pages. Return true if skip zapping this page, false otherwise. + */ +static inline bool +zap_skip_check_mapping(struct zap_details *details, struct page *page) +{ + if (!details || !page) + return false; + + return details->zap_mapping && + (details->zap_mapping != page_rmapping(page)); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1443,8 +1466,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ - } else if (details && details->single_page && - PageTransCompound(details->single_page) && + } else if (details && details->single_folio && + folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* @@ -3332,31 +3355,30 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } /** - * unmap_mapping_page() - Unmap single page from processes. - * @page: The locked page to be unmapped. + * unmap_mapping_folio() - Unmap single folio from processes. + * @folio: The locked folio to be unmapped. * - * Unmap this page from any userspace process which still has it mmaped. + * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once - * truncation or invalidation holds the lock on a page, it may find that - * the page has been remapped again: and then uses unmap_mapping_page() + * truncation or invalidation holds the lock on a folio, it may find that + * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ -void unmap_mapping_page(struct page *page) +void unmap_mapping_folio(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageTail(page)); + VM_BUG_ON(!folio_test_locked(folio)); - first_index = page->index; - last_index = page->index + thp_nr_pages(page) - 1; + first_index = folio->index; + last_index = folio->index + folio_nr_pages(folio) - 1; details.zap_mapping = mapping; - details.single_page = page; + details.single_folio = folio; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) @@ -3626,7 +3648,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -3639,8 +3661,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -3651,6 +3671,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) do_page_add_anon_rmap(page, vma, vmf->address, exclusive); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10e9c87260ed..028e8dd82b44 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; * @node: Node id to start the search * * Lookup the next closest node by distance if @nid is not online. + * + * Return: this @node if it is online, otherwise the closest node by distance */ int numa_map_to_online_node(int node) { @@ -296,6 +298,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; + policy->home_node = NUMA_NO_NODE; return policy; } @@ -810,7 +813,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + vma_anon_name(vma)); if (prev) { vma = prev; next = vma->vm_next; @@ -1477,6 +1481,77 @@ static long kernel_mbind(unsigned long start, unsigned long len, return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct mempolicy *new; + unsigned long vmstart; + unsigned long vmend; + unsigned long end; + int err = -ENOENT; + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; + + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. + */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) + return -EINVAL; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + vma = find_vma(mm, start); + for (; vma && vma->vm_start < end; vma = vma->vm_next) { + + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + new = mpol_dup(vma_policy(vma)); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + /* + * Only update home node if there is an existing vma policy + */ + if (!new) + continue; + + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. + */ + if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + err = -EOPNOTSUPP; + break; + } + + new->home_node = home_node; + err = mbind_range(mm, vmstart, vmend, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; +} + SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) @@ -1801,6 +1876,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + if ((policy->mode == MPOL_BIND || + policy->mode == MPOL_PREFERRED_MANY) && + policy->home_node != NUMA_NO_NODE) + return policy->home_node; + return nd; } @@ -2061,7 +2141,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); if (!page) - page = __alloc_pages(gfp, order, numa_node_id(), NULL); + page = __alloc_pages(gfp, order, nid, NULL); return page; } @@ -2072,7 +2152,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * @order: Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual address of the allocation. Must be inside @vma. - * @node: Which node to prefer for allocation (modulo policy). * @hugepage: For hugepages try only the preferred node if possible. * * Allocate a page for a specific address in @vma, using the appropriate @@ -2083,9 +2162,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, bool hugepage) { struct mempolicy *pol; + int node = numa_node_id(); struct page *page; int preferred_nid; nodemask_t *nmask; @@ -2102,6 +2182,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, } if (pol->mode == MPOL_PREFERRED_MANY) { + node = policy_node(gfp, pol, node); page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); goto out; @@ -2140,8 +2221,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * memory with both reclaim and compact as well. */ if (!page && (gfp & __GFP_DIRECT_RECLAIM)) - page = __alloc_pages_node(hpage_node, - gfp, order); + page = __alloc_pages(gfp, order, hpage_node, nmask); goto out; } @@ -2186,7 +2266,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, - numa_node_id(), pol); + policy_node(gfp, pol, numa_node_id()), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), @@ -2342,6 +2422,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) return false; if (a->flags != b->flags) return false; + if (a->home_node != b->home_node) + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; @@ -2885,7 +2967,7 @@ static const char * const policy_modes[] = * Format of input: * <mode>[=<flags>][:<nodelist>] * - * On success, returns 0, else 1 + * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { diff --git a/mm/memremap.c b/mm/memremap.c index 5a66a71ab591..6aa5f0c2d11f 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -102,39 +102,22 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(unsigned long pfn) +static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) { - if (pfn % 1024 == 0) + if (pfn % (1024 << pgmap->vmemmap_shift)) cond_resched(); - return pfn + 1; + return pfn + pgmap_vmemmap_nr(pgmap); } -#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) - -static void dev_pagemap_kill(struct dev_pagemap *pgmap) +static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) { - if (pgmap->ops && pgmap->ops->kill) - pgmap->ops->kill(pgmap); - else - percpu_ref_kill(pgmap->ref); + return (pfn_end(pgmap, range_id) - + pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } -static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->cleanup) { - pgmap->ops->cleanup(pgmap); - } else { - wait_for_completion(&pgmap->done); - percpu_ref_exit(pgmap->ref); - } - /* - * Undo the pgmap ref assignment for the internal case as the - * caller may re-enable the same pgmap. - */ - if (pgmap->ref == &pgmap->internal_ref) - pgmap->ref = NULL; -} +#define for_each_device_pfn(pfn, map, i) \ + for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ + pfn = pfn_next(map, pfn)) static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { @@ -167,11 +150,12 @@ void memunmap_pages(struct dev_pagemap *pgmap) unsigned long pfn; int i; - dev_pagemap_kill(pgmap); + percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) for_each_device_pfn(pfn, pgmap, i) put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); + wait_for_completion(&pgmap->done); + percpu_ref_exit(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) pageunmap_range(pgmap, i); @@ -188,8 +172,7 @@ static void devm_memremap_pages_release(void *data) static void dev_pagemap_percpu_release(struct percpu_ref *ref) { - struct dev_pagemap *pgmap = - container_of(ref, struct dev_pagemap, internal_ref); + struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref); complete(&pgmap->done); } @@ -295,8 +278,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id) - - pfn_first(pgmap, range_id)); + percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: @@ -362,22 +344,11 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) break; } - if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0, + GFP_KERNEL); + if (error) + return ERR_PTR(error); devmap_managed_enable_get(pgmap); @@ -486,7 +457,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + if (pgmap && !percpu_ref_tryget_live(&pgmap->ref)) pgmap = NULL; rcu_read_unlock(); diff --git a/mm/migrate.c b/mm/migrate.c index cf25b00f03c8..18ce840914f0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@ #include <linux/ptrace.h> #include <linux/oom.h> #include <linux/memory.h> +#include <linux/random.h> #include <asm/tlbflush.h> @@ -236,20 +237,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } else #endif { - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else page_add_file_rmap(new, false); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); @@ -291,7 +291,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, { pte_t pte; swp_entry_t entry; - struct page *page; + struct folio *folio; spin_lock(ptl); pte = *ptep; @@ -302,18 +302,17 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - page = pfn_swap_entry_to_page(entry); - page = compound_head(page); + folio = page_folio(pfn_swap_entry_to_page(entry)); /* * Once page cache replacement of page migration started, page_count - * is zero; but we must not call put_and_wait_on_page_locked() without - * a ref. Use get_page_unless_zero(), and just fault again if it fails. + * is zero; but we must not call folio_put_wait_locked() without + * a ref. Use folio_try_get(), and just fault again if it fails. */ - if (!get_page_unless_zero(page)) + if (!folio_try_get(folio)) goto out; pte_unmap_unlock(ptep, ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); return; out: pte_unmap_unlock(ptep, ptl); @@ -338,16 +337,16 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - struct page *page; + struct folio *folio; ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)); - if (!get_page_unless_zero(page)) + folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd))); + if (!folio_try_get(folio)) goto unlock; spin_unlock(ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); return; unlock: spin_unlock(ptl); @@ -434,14 +433,6 @@ int folio_migrate_mapping(struct address_space *mapping, } xas_store(&xas, newfolio); - if (nr > 1) { - int i; - - for (i = 1; i < nr; i++) { - xas_next(&xas); - xas_store(&xas, newfolio); - } - } /* * Drop cache reference from old page by unfreezing @@ -1093,80 +1084,6 @@ out: return rc; } - -/* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { 1, // Node 0 migrates to 1 - * 2, // Node 1 migrates to 2 - * -1, // Node 2 does not migrate - * 4, // Node 3 migrates to 4 - * 5, // Node 4 migrates to 5 - * -1} // Node 5 does not migrate - */ - -/* - * Writes to this array occur without locking. Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ -static int node_demotion[MAX_NUMNODES] __read_mostly = - {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; - -/** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal. This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. - */ -int next_demotion_node(int node) -{ - int target; - - /* - * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. - * - * Make sure to use RCU over entire code blocks if - * node_demotion[] reads need to be consistent. - */ - rcu_read_lock(); - target = READ_ONCE(node_demotion[node]); - rcu_read_unlock(); - - return target; -} - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -1422,7 +1339,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of pages migrated successfully if + * @ret_succeeded: Set to the number of normal pages migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more @@ -1430,7 +1347,9 @@ static inline int try_split_thp(struct page *page, struct page **page2, * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. * - * Returns the number of pages that were not migrated, or an error code. + * Returns the number of {normal page, THP, hugetlb} that were not migrated, or + * an error code. The number of THP splits will be considered as the number of + * non-migrated THP, no matter how many subpages of the THP are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@ -1439,6 +1358,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int retry = 1; int thp_retry = 1; int nr_failed = 0; + int nr_failed_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; int nr_thp_failed = 0; @@ -1450,13 +1370,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); + LIST_HEAD(thp_split_pages); bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_subpage_counting = false; trace_mm_migrate_pages_start(mode, reason); if (!swapwrite) current->flags |= PF_SWAPWRITE; +thp_subpage_migration: for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; thp_retry = 0; @@ -1469,7 +1392,7 @@ retry: * during migration. */ is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = thp_nr_pages(page); + nr_subpages = compound_nr(page); cond_resched(); if (PageHuge(page)) @@ -1505,18 +1428,20 @@ retry: case -ENOSYS: /* THP migration is unsupported */ if (is_thp) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } /* Hugetlb migration is unsupported */ - nr_failed++; + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; case -ENOMEM: /* @@ -1525,16 +1450,19 @@ retry: * THP NUMA faulting doesn't split THP to retry. */ if (is_thp && !nosplit) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; goto out; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; goto out; case -EAGAIN: if (is_thp) { @@ -1544,12 +1472,11 @@ retry: retry++; break; case MIGRATEPAGE_SUCCESS: + nr_succeeded += nr_subpages; if (is_thp) { nr_thp_succeeded++; - nr_succeeded += nr_subpages; break; } - nr_succeeded++; break; default: /* @@ -1560,17 +1487,37 @@ retry: */ if (is_thp) { nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; } } } - nr_failed += retry + thp_retry; + nr_failed += retry; nr_thp_failed += thp_retry; - rc = nr_failed; + /* + * Try to migrate subpages of fail-to-migrate THPs, no nr_failed + * counting in this round, since all subpages of a THP is counted + * as 1 failure in the first round. + */ + if (!list_empty(&thp_split_pages)) { + /* + * Move non-migrated pages (after 10 retries) to ret_pages + * to avoid migrating them again. + */ + list_splice_init(from, &ret_pages); + list_splice_init(&thp_split_pages, from); + no_subpage_counting = true; + retry = 1; + goto thp_subpage_migration; + } + + rc = nr_failed + nr_thp_failed; out: /* * Put the permanent failure page back to migration list, they @@ -1579,11 +1526,11 @@ out: list_splice(&ret_pages, from); count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, nr_thp_failed, nr_thp_split, mode, reason); if (!swapwrite) @@ -2525,8 +2472,7 @@ static bool migrate_vma_check_page(struct page *page) static void migrate_vma_unmap(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - unsigned long addr, i, restore = 0; + unsigned long i, restore = 0; bool allow_drain = true; lru_add_drain(); @@ -2572,7 +2518,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) } } - for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + for (i = 0; i < npages && restore; i++) { struct page *page = migrate_pfn_to_page(migrate->src[i]); if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) @@ -2970,14 +2916,152 @@ void migrate_vma_finalize(struct migrate_vma *migrate) EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + * 0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +#define DEFAULT_DEMOTION_TARGET_NODES 15 + +#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES +#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) +#else +#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES +#endif + +struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; +}; + +static struct demotion_nodes *node_demotion __read_mostly; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + struct demotion_nodes *nd; + unsigned short target_nr, index; + int target; + + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target_nr = READ_ONCE(nd->nr); + + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + index = get_random_int() % target_nr; + break; + } + + target = READ_ONCE(nd->nodes[index]); + +out: + rcu_read_unlock(); + return target; +} + #if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { - int node; + int node, i; + + if (!node_demotion) + return; - for_each_online_node(node) - node_demotion[node] = NUMA_NO_NODE; + for_each_online_node(node) { + node_demotion[node].nr = 0; + for (i = 0; i < DEMOTION_TARGET_NODES; i++) + node_demotion[node].nodes[i] = NUMA_NO_NODE; + } } static void disable_all_migrate_targets(void) @@ -3004,26 +3088,40 @@ static void disable_all_migrate_targets(void) * Failing here is OK. It might just indicate * being at the end of a chain. */ -static int establish_migrate_target(int node, nodemask_t *used) +static int establish_migrate_target(int node, nodemask_t *used, + int best_distance) { - int migration_target; + int migration_target, index, val; + struct demotion_nodes *nd; - /* - * Can not set a migration target on a - * node with it already set. - * - * No need for READ_ONCE() here since this - * in the write path for node_demotion[]. - * This should be the only thread writing. - */ - if (node_demotion[node] != NUMA_NO_NODE) + if (!node_demotion) return NUMA_NO_NODE; + nd = &node_demotion[node]; + migration_target = find_next_best_node(node, used); if (migration_target == NUMA_NO_NODE) return NUMA_NO_NODE; - node_demotion[node] = migration_target; + /* + * If the node has been set a migration target node before, + * which means it's the best distance between them. Still + * check if this node can be demoted to other target nodes + * if they have a same best distance. + */ + if (best_distance != -1) { + val = node_distance(node, migration_target); + if (val > best_distance) + return NUMA_NO_NODE; + } + + index = nd->nr; + if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, + "Exceeds maximum demotion target nodes\n")) + return NUMA_NO_NODE; + + nd->nodes[index] = migration_target; + nd->nr++; return migration_target; } @@ -3039,7 +3137,9 @@ static int establish_migrate_target(int node, nodemask_t *used) * * The difference here is that cycles must be avoided. If * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. + * node1 migrates to can migrate to node0. Also one node can + * be migrated to multiple nodes if the target nodes all have + * a same best-distance against the source node. * * This function can run simultaneously with readers of * node_demotion[]. However, it can not run simultaneously @@ -3051,7 +3151,7 @@ static void __set_migration_target_nodes(void) nodemask_t next_pass = NODE_MASK_NONE; nodemask_t this_pass = NODE_MASK_NONE; nodemask_t used_targets = NODE_MASK_NONE; - int node; + int node, best_distance; /* * Avoid any oddities like cycles that could occur @@ -3080,18 +3180,33 @@ again: * multiple source nodes to share a destination. */ nodes_or(used_targets, used_targets, this_pass); - for_each_node_mask(node, this_pass) { - int target_node = establish_migrate_target(node, &used_targets); - if (target_node == NUMA_NO_NODE) - continue; + for_each_node_mask(node, this_pass) { + best_distance = -1; /* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. + * Try to set up the migration path for the node, and the target + * migration nodes can be multiple, so doing a loop to find all + * the target nodes if they all have a best node distance. */ - node_set(target_node, next_pass); + do { + int target_node = + establish_migrate_target(node, &used_targets, + best_distance); + + if (target_node == NUMA_NO_NODE) + break; + + if (best_distance == -1) + best_distance = node_distance(node, target_node); + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } while (1); } /* * 'next_pass' contains nodes which became migration @@ -3192,6 +3307,11 @@ static int __init migrate_on_reclaim_init(void) { int ret; + node_demotion = kmalloc_array(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); + ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", NULL, migration_offline_cpu); /* diff --git a/mm/mlock.c b/mm/mlock.c index e263d62ae2d0..8f584eddd305 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index bfb0ea164a90..1e8fdb0b51ed 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/vmacache.h> #include <linux/shm.h> #include <linux/mman.h> @@ -1029,7 +1030,8 @@ again: */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1047,6 +1049,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if (!is_same_vma_anon_name(vma, anon_name)) + return 0; return 1; } @@ -1079,9 +1083,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1100,9 +1105,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1113,9 +1119,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1160,7 +1166,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1190,7 +1197,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { /* * OK, it can. Can we now merge in the successor as well? */ @@ -1199,7 +1206,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1222,7 +1229,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1754,7 +1761,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -1803,7 +1810,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* ->mmap() can change vma->vm_file and fput the original file. So * fput the vma->vm_file here or we would add an extra fput for file @@ -2928,7 +2935,6 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { addr = untagged_addr(addr); - profile_munmap(addr); return __vm_munmap(addr, len, true); } @@ -3056,7 +3062,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3142,25 +3148,27 @@ void exit_mmap(struct mm_struct *mm) * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. * - * This needs to be done before calling munlock_vma_pages_all(), + * This needs to be done before calling unlock_range(), * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. */ (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); - mmap_write_lock(mm); - mmap_write_unlock(mm); } + mmap_write_lock(mm); if (mm->locked_vm) unlock_range(mm->mmap, ULONG_MAX); arch_exit_mmap(mm); vma = mm->mmap; - if (!vma) /* Can happen if dup_mmap() received an OOM */ + if (!vma) { + /* Can happen if dup_mmap() received an OOM */ + mmap_write_unlock(mm); return; + } lru_add_drain(); flush_cache_mm(mm); @@ -3171,16 +3179,14 @@ void exit_mmap(struct mm_struct *mm) free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. - */ + /* Walk the list again, actually closing and freeing it. */ while (vma) { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); cond_resched(); } + mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3249,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 1b9837419bf9..afb7185ffdc4 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -3,6 +3,7 @@ #include <linux/kernel.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> +#include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/rcupdate.h> #include <linux/smp.h> diff --git a/mm/mprotect.c b/mm/mprotect.c index e552f5e0ccbd..0138dfcdb1d8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ddabefcfb5a..832fb330376e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -793,7 +793,7 @@ static inline bool __task_will_free_mem(struct task_struct *task) * coredump_task_exit(), so the oom killer cannot assume that * the process will promptly exit and release memory. */ - if (sig->flags & SIGNAL_GROUP_COREDUMP) + if (sig->core_state) return false; if (sig->flags & SIGNAL_GROUP_EXIT) @@ -994,6 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * If necessary, kill all tasks in the selected memory cgroup. */ if (oom_group) { + memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL); mem_cgroup_print_oom_group(oom_group); mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, (void *)message); @@ -1057,7 +1058,7 @@ bool out_of_memory(struct oom_control *oc) if (!is_memcg_oom(oc)) { blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) + if (freed > 0 && !is_sysrq_oom(oc)) /* Got some memory back in the last second. */ return true; } @@ -1169,15 +1170,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) goto put_task; } - if (mmget_not_zero(p->mm)) { - mm = p->mm; - if (task_will_free_mem(p)) - reap = true; - else { - /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) - ret = -EINVAL; - } + mm = p->mm; + mmgrab(mm); + + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; } task_unlock(p); @@ -1188,13 +1189,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) ret = -EINTR; goto drop_mm; } - if (!__oom_reap_task_mm(mm)) + /* + * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure + * possible change in exit_mmap is seen + */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); drop_mm: - if (mm) - mmput(mm); + mmdrop(mm); put_task: put_task_struct(task); return ret; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a613f8ef6a02..91d163f8d36b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2496,7 +2496,11 @@ void folio_account_cleaned(struct folio *folio, struct address_space *mapping, * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold lock_page_memcg(). + * The caller must hold lock_page_memcg(). Most callers have the folio + * locked. A few have the folio blocked from truncation through other + * means (eg zap_page_range() has it mapped and is holding the page table + * lock). This can also be called from mark_buffer_dirty(), which I + * cannot prove is always protected against truncate. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5952749ad40..d4205e5e41d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/interrupt.h> #include <linux/pagemap.h> #include <linux/jiffies.h> @@ -63,6 +64,7 @@ #include <linux/sched/rt.h> #include <linux/sched/mm.h> #include <linux/page_owner.h> +#include <linux/page_table_check.h> #include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> @@ -726,23 +728,33 @@ void free_compound_page(struct page *page) free_the_page(page, compound_order(page)); } +static void prep_compound_head(struct page *page, unsigned int order) +{ + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + set_compound_order(page, order); + atomic_set(compound_mapcount_ptr(page), -1); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); +} + +static void prep_compound_tail(struct page *head, int tail_idx) +{ + struct page *p = head + tail_idx; + + p->mapping = TAIL_MAPPING; + set_compound_head(p, head); +} + void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - p->mapping = TAIL_MAPPING; - set_compound_head(p, page); - } + for (i = 1; i < nr_pages; i++) + prep_compound_tail(page, i); - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); - set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + prep_compound_head(page, order); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -1297,6 +1309,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); + page_table_check_free(page, order); return false; } @@ -1336,6 +1349,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + page_table_check_free(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -2410,6 +2424,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } set_page_owner(page, order, gfp_flags); + page_table_check_alloc(page, order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -4204,7 +4219,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || + !__ratelimit(&nopage_rs) || + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) return; va_start(args, fmt); @@ -6562,6 +6579,75 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone } #ifdef CONFIG_ZONE_DEVICE +static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap) +{ + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->zone_device_data = NULL; + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap + * because this is done early in section_activate() + */ + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } +} + +static void __ref memmap_init_compound(struct page *head, + unsigned long head_pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap, + unsigned long nr_pages) +{ + unsigned long pfn, end_pfn = head_pfn + nr_pages; + unsigned int order = pgmap->vmemmap_shift; + + __SetPageHead(head); + for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + prep_compound_tail(head, pfn - head_pfn); + set_page_count(page, 0); + + /* + * The first tail page stores compound_mapcount_ptr() and + * compound_order() and the second tail page stores + * compound_pincount_ptr(). Call prep_compound_head() after + * the first and second tail pages have been initialized to + * not have the data overwritten. + */ + if (pfn == head_pfn + 2) + prep_compound_head(head, order); + } +} + void __ref memmap_init_zone_device(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, @@ -6570,6 +6656,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long pfn, end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; struct vmem_altmap *altmap = pgmap_altmap(pgmap); + unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; @@ -6587,42 +6674,16 @@ void __ref memmap_init_zone_device(struct zone *zone, nr_pages = end_pfn - start_pfn; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) { struct page *page = pfn_to_page(pfn); - __init_single_page(page, pfn, zone_idx, nid); - - /* - * Mark page reserved as it will need to wait for onlining - * phase for it to be fully associated with a zone. - * - * We can use the non-atomic __set_bit operation for setting - * the flag as we are still initializing the pages. - */ - __SetPageReserved(page); + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer - * and zone_device_data. It is a bug if a ZONE_DEVICE page is - * ever freed or placed on a driver-private list. - */ - page->pgmap = pgmap; - page->zone_device_data = NULL; + if (pfns_per_compound == 1) + continue; - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in section_activate() - */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - cond_resched(); - } + memmap_init_compound(page, pfn, zone_idx, nid, pgmap, + pfns_per_compound); } pr_info("%s initialised %lu pages in %ums\n", __func__, @@ -8170,7 +8231,7 @@ void __init mem_init_print_info(void) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) @@ -9214,8 +9275,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * for allocation requests which can not be fulfilled with the buddy allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a - * power of two then the alignment is guaranteed to be to the given nr_pages - * (e.g. 1GB request would be aligned to 1GB). + * power of two, then allocated range is also guaranteed to be aligned to same + * nr_pages (e.g. 1GB request would be aligned to 1GB). * * Allocated pages can be freed with free_contig_range() or by manually calling * __free_page() on each allocated page. @@ -9448,6 +9509,7 @@ bool take_page_off_buddy(struct page *page) del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); + SetPageHWPoisonTakenOff(page); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, -1, migratetype); ret = true; @@ -9459,4 +9521,44 @@ bool take_page_off_buddy(struct page *page) spin_unlock_irqrestore(&zone->lock, flags); return ret; } + +/* + * Cancel takeoff done by take_page_off_buddy(). + */ +bool put_page_back_buddy(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { + num_poisoned_pages_dec(); + ret = true; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + return ret; +} #endif + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + struct zone *zone = &pgdat->node_zones[ZONE_DMA]; + + if (managed_zone(zone)) + return true; + } + return false; +} +#endif /* CONFIG_ZONE_DMA */ diff --git a/mm/page_counter.c b/mm/page_counter.c index 7d83641eb86b..eb156ff5d603 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -120,7 +120,6 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); - propagate_protected_usage(c, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used diff --git a/mm/page_ext.c b/mm/page_ext.c index 6242afb24d84..2e66d934d63f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -8,6 +8,7 @@ #include <linux/kmemleak.h> #include <linux/page_owner.h> #include <linux/page_idle.h> +#include <linux/page_table_check.h> /* * struct page extension @@ -63,18 +64,21 @@ static bool need_page_idle(void) { return true; } -struct page_ext_operations page_idle_ops = { +static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, }; #endif -static struct page_ext_operations *page_ext_ops[] = { +static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_PAGE_TABLE_CHECK + &page_table_check_ops, +#endif }; unsigned long page_ext_size = sizeof(struct page_ext); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f67c4c70f17f..6a0ddda6b3c5 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * onlining - just onlined memory won't immediately be considered for * allocation. */ - if (!isolated_page) { + if (!isolated_page && PageBuddy(page)) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f924957ce7a..5eea061bb1e5 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf) } early_param("page_owner", early_page_owner_param); -static bool need_page_owner(void) +static __init bool need_page_owner(void) { return page_owner_enabled; } @@ -75,7 +75,7 @@ static noinline void register_early_stack(void) early_handle = create_dummy_stack(); } -static void init_page_owner(void) +static __init void init_page_owner(void) { if (!page_owner_enabled) return; diff --git a/mm/page_table_check.c b/mm/page_table_check.c new file mode 100644 index 000000000000..7504e7caa2a1 --- /dev/null +++ b/mm/page_table_check.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2021, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ +#include <linux/mm.h> +#include <linux/page_table_check.h> + +#undef pr_fmt +#define pr_fmt(fmt) "page_table_check: " fmt + +struct page_table_check { + atomic_t anon_map_count; + atomic_t file_map_count; +}; + +static bool __page_table_check_enabled __initdata = + IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); + +DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); +EXPORT_SYMBOL(page_table_check_disabled); + +static int __init early_page_table_check_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + __page_table_check_enabled = true; + else if (strcmp(buf, "off") == 0) + __page_table_check_enabled = false; + + return 0; +} + +early_param("page_table_check", early_page_table_check_param); + +static bool __init need_page_table_check(void) +{ + return __page_table_check_enabled; +} + +static void __init init_page_table_check(void) +{ + if (!__page_table_check_enabled) + return; + static_branch_disable(&page_table_check_disabled); +} + +struct page_ext_operations page_table_check_ops = { + .size = sizeof(struct page_table_check), + .need = need_page_table_check, + .init = init_page_table_check, +}; + +static struct page_table_check *get_page_table_check(struct page_ext *page_ext) +{ + BUG_ON(!page_ext); + return (void *)(page_ext) + page_table_check_ops.offset; +} + +static inline bool pte_user_accessible_page(pte_t pte) +{ + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && + (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && + (pud_val(pud) & _PAGE_USER); +} + +/* + * An enty is removed from the page table, decrement the counters for that page + * verify that it is of correct type and counters do not become negative. + */ +static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt) +{ + struct page_ext *page_ext; + struct page *page; + bool anon; + int i; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * A new enty is added to the page table, increment the counters for that page + * verify that it is of correct type and is not being mapped with a different + * type to a different process. + */ +static void page_table_check_set(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt, + bool rw) +{ + struct page_ext *page_ext; + struct page *page; + bool anon; + int i; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * page is on free list, or is being allocated, verify that counters are zeroes + * crash if they are not. + */ +void __page_table_check_zero(struct page *page, unsigned int order) +{ + struct page_ext *page_ext = lookup_page_ext(page); + int i; + + BUG_ON(!page_ext); + for (i = 0; i < (1 << order); i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_read(&ptc->file_map_count)); + page_ext = page_ext_next(page_ext); + } +} + +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) +{ + if (&init_mm == mm) + return; + + if (pte_user_accessible_page(pte)) { + page_table_check_clear(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pte_clear); + +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (pmd_user_accessible_page(pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_clear); + +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) +{ + if (&init_mm == mm) + return; + + if (pud_user_accessible_page(pud)) { + page_table_check_clear(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pud_clear); + +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + pte_t old_pte; + + if (&init_mm == mm) + return; + + old_pte = *ptep; + if (pte_user_accessible_page(old_pte)) { + page_table_check_clear(mm, addr, pte_pfn(old_pte), + PAGE_SIZE >> PAGE_SHIFT); + } + + if (pte_user_accessible_page(pte)) { + page_table_check_set(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT, + pte_write(pte)); + } +} +EXPORT_SYMBOL(__page_table_check_pte_set); + +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old_pmd; + + if (&init_mm == mm) + return; + + old_pmd = *pmdp; + if (pmd_user_accessible_page(old_pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(old_pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } + + if (pmd_user_accessible_page(pmd)) { + page_table_check_set(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT, + pmd_write(pmd)); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_set); + +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + pud_t old_pud; + + if (&init_mm == mm) + return; + + old_pud = *pudp; + if (pud_user_accessible_page(old_pud)) { + page_table_check_clear(mm, addr, pud_pfn(old_pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } + + if (pud_user_accessible_page(pud)) { + page_table_check_set(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT, + pud_write(pud)); + } +} +EXPORT_SYMBOL(__page_table_check_pud_set); diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 639662c20c82..411d1593ef23 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -113,6 +113,24 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } +#ifdef CONFIG_MEMCG_KMEM +/** + * pcpu_obj_full_size - helper to calculate size of each accounted object + * @size: size of area to allocate in bytes + * + * For each accounted object there is an extra space which is used to store + * obj_cgroup membership. Charge it too. + */ +static inline size_t pcpu_obj_full_size(size_t size) +{ + size_t extra_size; + + extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); + + return size * num_possible_cpus() + extra_size; +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> diff --git a/mm/percpu.c b/mm/percpu.c index f5b2c2ea5a54..4199a0604c32 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1635,7 +1635,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, if (!objcg) return true; - if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) { + if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) { obj_cgroup_put(objcg); return false; } @@ -1656,10 +1656,10 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - size * num_possible_cpus()); + pcpu_obj_full_size(size)); rcu_read_unlock(); } else { - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); obj_cgroup_put(objcg); } } @@ -1676,11 +1676,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) return; chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - -(size * num_possible_cpus())); + -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4e640baf9794..6523fda274e5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> +#include <linux/mm_inline.h> #include <asm/tlb.h> /* diff --git a/mm/readahead.c b/mm/readahead.c index 6ae5693de28c..cf0dcf89eb69 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -196,9 +196,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * Preallocate as many pages as we will need. */ for (i = 0; i < nr_to_read; i++) { - struct page *page = xa_load(&mapping->i_pages, index + i); + struct folio *folio = xa_load(&mapping->i_pages, index + i); - if (page && !xa_is_value(page)) { + if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the @@ -212,21 +212,21 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) break; if (mapping->a_ops->readpages) { - page->index = index + i; - list_add(&page->lru, &page_pool); - } else if (add_to_page_cache_lru(page, mapping, index + i, + folio->index = index + i; + list_add(&folio->lru, &page_pool); + } else if (filemap_add_folio(mapping, folio, index + i, gfp_mask) < 0) { - put_page(page); + folio_put(folio); read_pages(ractl, &page_pool, true); i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) - SetPageReadahead(page); + folio_set_readahead(folio); ractl->_nr_pages++; } @@ -581,7 +581,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, - struct page *page, unsigned long req_count) + struct folio *folio, unsigned long req_count) { /* no read-ahead */ if (!ractl->ra->ra_pages) @@ -590,10 +590,10 @@ void page_cache_async_ra(struct readahead_control *ractl, /* * Same bit is used for PG_readahead and PG_reclaim. */ - if (PageWriteback(page)) + if (folio_test_writeback(folio)) return; - ClearPageReadahead(page); + folio_clear_readahead(folio); /* * Defer asynchronous read-ahead on IO congestion. diff --git a/mm/rmap.c b/mm/rmap.c index 163ac4e6bcee..6a1e8c7f6213 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -621,9 +621,20 @@ void try_to_unmap_flush_dirty(void) try_to_unmap_flush(); } +/* + * Bits 0-14 of mm->tlb_flush_batched record pending generations. + * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. + */ +#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 +#define TLB_FLUSH_BATCH_PENDING_MASK \ + ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) +#define TLB_FLUSH_BATCH_PENDING_LARGE \ + (TLB_FLUSH_BATCH_PENDING_MASK / 2) + static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int batch, nbatch; arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; @@ -633,7 +644,22 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) * before the PTE is cleared. */ barrier(); - mm->tlb_flush_batched = true; + batch = atomic_read(&mm->tlb_flush_batched); +retry: + if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { + /* + * Prevent `pending' from catching up with `flushed' because of + * overflow. Reset `pending' and `flushed' to be 1 and 0 if + * `pending' becomes large. + */ + nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1); + if (nbatch != batch) { + batch = nbatch; + goto retry; + } + } else { + atomic_inc(&mm->tlb_flush_batched); + } /* * If the PTE was dirty then it's best to assume it's writable. The @@ -680,15 +706,18 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ void flush_tlb_batched_pending(struct mm_struct *mm) { - if (data_race(mm->tlb_flush_batched)) { - flush_tlb_mm(mm); + int batch = atomic_read(&mm->tlb_flush_batched); + int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; + int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; + if (pending != flushed) { + flush_tlb_mm(mm); /* - * Do not allow the compiler to re-order the clearing of - * tlb_flush_batched before the tlb is flushed. + * If the new TLB flushing is pending during flushing, leave + * mm->tlb_flush_batched as is, to avoid losing flushing. */ - barrier(); - mm->tlb_flush_batched = false; + atomic_cmpxchg(&mm->tlb_flush_batched, batch, + pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else diff --git a/mm/shmem.c b/mm/shmem.c index 18f93c2d68f1..66909efd0a1b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -554,7 +554,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shmem_inode_info *info; struct page *page; unsigned long batch = sc ? sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -569,7 +569,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; } @@ -577,12 +576,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; } list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@ -602,7 +601,7 @@ next: inode = &info->vfs_inode; if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back; page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@ -616,38 +615,44 @@ next: } /* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; } ret = split_huge_page(page); unlock_page(page); put_page(page); - /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back; split++; drop: list_del_init(&info->shrinklist); - removed++; -leave: + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: iput(inode); } - spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; } @@ -694,7 +699,6 @@ static int shmem_add_to_page_cache(struct page *page, struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); - unsigned long i = 0; unsigned long nr = compound_nr(page); int error; @@ -721,20 +725,18 @@ static int shmem_add_to_page_cache(struct page *page, cgroup_throttle_swaprate(page, gfp); do { - void *entry; xas_lock_irq(&xas); - entry = xas_find_conflict(&xas); - if (entry != expected) + if (expected != xas_find_conflict(&xas)) { xas_set_err(&xas, -EEXIST); - xas_create_range(&xas); - if (xas_error(&xas)) goto unlock; -next: - xas_store(&xas, page); - if (++i < nr) { - xas_next(&xas); - goto next; } + if (expected && xas_find_conflict(&xas)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); @@ -880,30 +882,26 @@ void shmem_unlock_mapping(struct address_space *mapping) } } -/* - * Check whether a hole-punch or truncation needs to split a huge page, - * returning true if no split was required, or the split has been successful. - * - * Eviction (or truncation to 0 size) should never need to split a huge page; - * but in rare cases might do so, if shmem_undo_range() failed to trylock on - * head, and then succeeded to trylock on tail. - * - * A split can only succeed when there are no additional references on the - * huge page: so the split below relies upon find_get_entries() having stopped - * when it found a subpage of the huge page, without getting further references. - */ -static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { - if (!PageTransCompound(page)) - return true; - - /* Just proceed to delete a huge page wholly within the range punched */ - if (PageHead(page) && - page->index >= start && page->index + HPAGE_PMD_NR <= end) - return true; + struct folio *folio; + struct page *page; - /* Try to split huge page, so we can truly punch the hole or truncate */ - return split_huge_page(page) >= 0; + /* + * At first avoid shmem_getpage(,,,SGP_READ): that fails + * beyond i_size, and reports fallocated pages as holes. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_ENTRY | FGP_LOCK, 0); + if (!xa_is_value(folio)) + return folio; + /* + * But read a page back from swap if any of it is within i_size + * (although in some cases this is just a waste of time). + */ + page = NULL; + shmem_getpage(inode, index, &page, SGP_READ); + return page ? page_folio(page) : NULL; } /* @@ -917,10 +915,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; - unsigned int partial_start = lstart & (PAGE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; + struct folio *folio; + bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; @@ -931,67 +929,64 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + index, folio); continue; } - index += thp_nr_pages(page) - 1; + index += folio_nr_pages(folio) - 1; - if (!unfalloc || !PageUptodate(page)) - truncate_inode_page(mapping, page); - unlock_page(page); + if (!unfalloc || !folio_test_uptodate(folio)) + truncate_inode_folio(mapping, folio); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } - if (partial_start) { - struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - top = partial_end; - partial_end = 0; - } - zero_user_segment(page, partial_start, top); - set_page_dirty(page); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ); - if (page) { - zero_user_segment(page, 0, partial_end); - set_page_dirty(page); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); + if (folio) { + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - if (start >= end) - return; index = start; while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -1000,14 +995,14 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index = start; continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, page)) { + if (shmem_free_swap(mapping, index, folio)) { /* Swap was replaced by page: retry */ index--; break; @@ -1016,32 +1011,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; } - lock_page(page); + folio_lock(folio); - if (!unfalloc || !PageUptodate(page)) { - if (page_mapping(page) != mapping) { + if (!unfalloc || !folio_test_uptodate(folio)) { + if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ - unlock_page(page); + folio_unlock(folio); index--; break; } - VM_BUG_ON_PAGE(PageWriteback(page), page); - if (shmem_punch_compound(page, start, end)) - truncate_inode_page(mapping, page); - else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - /* Wipe the page and don't get stuck */ - clear_highpage(page); - flush_dcache_page(page); - set_page_dirty(page); - if (index < - round_up(start, HPAGE_PMD_NR)) - start = index + 1; - } + VM_BUG_ON_FOLIO(folio_test_writeback(folio), + folio); + truncate_inode_folio(mapping, folio); } - unlock_page(page); + index = folio->index + folio_nr_pages(folio) - 1; + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); index++; } @@ -1559,8 +1546,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), - true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -2457,6 +2443,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@ -2467,7 +2454,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (ret) + return ret; + + if (PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + *pagep = NULL; + return -EIO; + } + + return 0; } static int @@ -2554,6 +2553,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (sgp == SGP_CACHE) set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } } /* @@ -3093,7 +3098,8 @@ static const char *shmem_get_link(struct dentry *dentry, page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@ -3101,6 +3107,13 @@ static const char *shmem_get_link(struct dentry *dentry, error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (!page) + return ERR_PTR(-ECHILD); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@ -3751,6 +3764,13 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, @@ -3761,7 +3781,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops); @@ -4169,9 +4189,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) - page = ERR_PTR(error); - else - unlock_page(page); + return ERR_PTR(error); + + unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; #else /* diff --git a/mm/slab.h b/mm/slab.h index 207658b200ef..c7f2abc2b154 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -788,11 +788,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif -void *slab_start(struct seq_file *m, loff_t *pos); -void *slab_next(struct seq_file *m, void *p, loff_t *pos); -void slab_stop(struct seq_file *m, void *p); -int memcg_slab_show(struct seq_file *m, void *p); - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) void dump_unreclaimable_slab(void); #else diff --git a/mm/slab_common.c b/mm/slab_common.c index dc15566141d4..23f2ab0713b7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -489,9 +489,7 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int err; - - if (unlikely(!s)) + if (unlikely(!s) || !kasan_check_byte(s)) return; cpus_read_lock(); @@ -501,12 +499,9 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - err = shutdown_cache(s); - if (err) { - pr_err("%s %s: Slab cache still has objects\n", - __func__, s->name); - dump_stack(); - } + WARN(shutdown_cache(s), + "%s %s: Slab cache still has objects when called from %pS", + __func__, s->name, (void *)_RET_IP_); out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); @@ -824,7 +819,7 @@ void __init setup_kmalloc_cache_index_table(void) if (KMALLOC_MIN_SIZE >= 64) { /* - * The 96 byte size cache is not used if the alignment + * The 96 byte sized cache is not used if the alignment * is 64 byte. */ for (i = 64 + 8; i <= 96; i += 8) @@ -849,7 +844,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) if (type == KMALLOC_RECLAIM) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { - if (cgroup_memory_nokmem) { + if (mem_cgroup_kmem_disabled()) { kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; return; } @@ -1044,18 +1039,18 @@ static void print_slabinfo_header(struct seq_file *m) seq_putc(m, '\n'); } -void *slab_start(struct seq_file *m, loff_t *pos) +static void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); return seq_list_start(&slab_caches, *pos); } -void *slab_next(struct seq_file *m, void *p, loff_t *pos) +static void *slab_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &slab_caches, pos); } -void slab_stop(struct seq_file *m, void *p) +static void slab_stop(struct seq_file *m, void *p) { mutex_unlock(&slab_mutex); } @@ -1123,17 +1118,6 @@ void dump_unreclaimable_slab(void) mutex_unlock(&slab_mutex); } -#if defined(CONFIG_MEMCG_KMEM) -int memcg_slab_show(struct seq_file *m, void *p) -{ - /* - * Deprecated. - * Please, take a look at tools/cgroup/slabinfo.py . - */ - return 0; -} -#endif - /* * slabinfo_op - iterator that generates /proc/slabinfo * diff --git a/mm/swap.c b/mm/swap.c index e8c9dc6d0377..bcf3ac288b56 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -882,7 +882,7 @@ void lru_cache_disable(void) * all online CPUs so any calls of lru_cache_disabled wrapped by * local_lock or preemption disabled would be ordered by that. * The atomic operation doesn't need to have stronger ordering - * requirements because that is enforeced by the scheduling + * requirements because that is enforced by the scheduling * guarantees. */ __lru_add_drain_all(true); @@ -1077,24 +1077,24 @@ void __pagevec_lru_add(struct pagevec *pvec) } /** - * pagevec_remove_exceptionals - pagevec exceptionals pruning - * @pvec: The pagevec to prune + * folio_batch_remove_exceptionals() - Prune non-folios from a batch. + * @fbatch: The batch to prune * - * find_get_entries() fills both pages and XArray value entries (aka - * exceptional entries) into the pagevec. This function prunes all - * exceptionals from @pvec without leaving holes, so that it can be - * passed on to page-only pagevec operations. + * find_get_entries() fills a batch with both folios and shadow/swap/DAX + * entries. This function prunes all the non-folio entries from @fbatch + * without leaving holes, so that it can be passed on to folio-only batch + * operations. */ -void pagevec_remove_exceptionals(struct pagevec *pvec) +void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { - int i, j; + unsigned int i, j; - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!xa_is_value(page)) - pvec->pages[j++] = page; + for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + if (!xa_is_value(folio)) + fbatch->folios[j++] = folio; } - pvec->nr = j; + fbatch->nr = j; } /** diff --git a/mm/swapfile.c b/mm/swapfile.c index e59e08ef46e1..caa9f81a0d15 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, +static int page_trans_huge_map_swapcount(struct page *page, int *total_swapcount) { - int i, map_swapcount, _total_mapcount, _total_swapcount; + int i, map_swapcount, _total_swapcount; unsigned long offset = 0; struct swap_info_struct *si; struct swap_cluster_info *ci = NULL; unsigned char *map = NULL; - int mapcount, swapcount = 0; + int swapcount = 0; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return mapcount + swapcount; + return swapcount + page_trans_huge_mapcount(page); } page = compound_head(page); - _total_mapcount = _total_swapcount = map_swapcount = 0; + _total_swapcount = map_swapcount = 0; if (PageSwapCache(page)) { swp_entry_t entry; @@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, if (map) ci = lock_cluster(si, offset); for (i = 0; i < HPAGE_PMD_NR; i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; - _total_mapcount += mapcount; + int mapcount = atomic_read(&page[i]._mapcount) + 1; if (map) { swapcount = swap_count(map[offset + i]); _total_swapcount += swapcount; @@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, map_swapcount = max(map_swapcount, mapcount + swapcount); } unlock_cluster(ci); - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) map_swapcount -= 1; - _total_mapcount -= HPAGE_PMD_NR; - } - mapcount = compound_mapcount(page); - map_swapcount += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; + if (total_swapcount) *total_swapcount = _total_swapcount; - return map_swapcount; + return map_swapcount + compound_mapcount(page); } /* @@ -1668,22 +1661,15 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. - * - * NOTE: total_map_swapcount should not be relied upon by the caller if - * reuse_swap_page() returns false, but it may be always overwritten - * (see the other implementation for CONFIG_SWAP=n). */ -bool reuse_swap_page(struct page *page, int *total_map_swapcount) +bool reuse_swap_page(struct page *page) { - int count, total_mapcount, total_swapcount; + int count, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_map_swapcount(page, &total_mapcount, - &total_swapcount); - if (total_map_swapcount) - *total_map_swapcount = total_mapcount + total_swapcount; + count = page_trans_huge_map_swapcount(page, &total_swapcount); if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ @@ -1917,14 +1903,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); out: pte_unmap_unlock(pte, ptl); diff --git a/mm/truncate.c b/mm/truncate.c index cc83a3f7c1ad..5e243d7269c0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -56,11 +56,11 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, /* * Unconditionally remove exceptional entries. Usually called from truncate - * path. Note that the pagevec may be altered by this function by removing - * exceptional entries similar to what pagevec_remove_exceptionals does. + * path. Note that the folio_batch may be altered by this function by removing + * exceptional entries similar to what folio_batch_remove_exceptionals() does. */ -static void truncate_exceptional_pvec_entries(struct address_space *mapping, - struct pagevec *pvec, pgoff_t *indices) +static void truncate_folio_batch_exceptionals(struct address_space *mapping, + struct folio_batch *fbatch, pgoff_t *indices) { int i, j; bool dax; @@ -69,11 +69,11 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, if (shmem_mapping(mapping)) return; - for (j = 0; j < pagevec_count(pvec); j++) - if (xa_is_value(pvec->pages[j])) + for (j = 0; j < folio_batch_count(fbatch); j++) + if (xa_is_value(fbatch->folios[j])) break; - if (j == pagevec_count(pvec)) + if (j == folio_batch_count(fbatch)) return; dax = dax_mapping(mapping); @@ -82,12 +82,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, xa_lock_irq(&mapping->i_pages); } - for (i = j; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; + for (i = j; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; pgoff_t index = indices[i]; - if (!xa_is_value(page)) { - pvec->pages[j++] = page; + if (!xa_is_value(folio)) { + fbatch->folios[j++] = folio; continue; } @@ -96,7 +96,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, continue; } - __clear_shadow_entry(mapping, index, page); + __clear_shadow_entry(mapping, index, folio); } if (!dax) { @@ -105,7 +105,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } - pvec->nr = j; + fbatch->nr = j; } /* @@ -177,21 +177,21 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void truncate_cleanup_page(struct page *page) +static void truncate_cleanup_folio(struct folio *folio) { - if (page_mapped(page)) - unmap_mapping_page(page); + if (folio_mapped(folio)) + unmap_mapping_folio(folio); - if (page_has_private(page)) - do_invalidatepage(page, 0, thp_size(page)); + if (folio_has_private(folio)) + do_invalidatepage(&folio->page, 0, folio_size(folio)); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ - cancel_dirty_page(page); - ClearPageMappedToDisk(page); + folio_cancel_dirty(folio); + folio_clear_mappedtodisk(folio); } /* @@ -205,7 +205,6 @@ static void truncate_cleanup_page(struct page *page) static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - int ret; if (page->mapping != mapping) return 0; @@ -213,28 +212,78 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - ret = remove_mapping(mapping, page); - - return ret; + return remove_mapping(mapping, page); } -int truncate_inode_page(struct address_space *mapping, struct page *page) +int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); - - if (page->mapping != mapping) + if (folio->mapping != mapping) return -EIO; - truncate_cleanup_page(page); - delete_from_page_cache(page); + truncate_cleanup_folio(folio); + filemap_remove_folio(folio); return 0; } /* + * Handle partial folios. The folio may be entirely within the + * range if a split has raced with us. If not, we zero the part of the + * folio that's within the [start, end] range, and then split the folio if + * it's large. split_page_range() will discard pages which now lie beyond + * i_size, and we rely on the caller to discard pages which lie within a + * newly created hole. + * + * Returns false if splitting failed so the caller can avoid + * discarding the entire folio which is stubbornly unsplit. + */ +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) +{ + loff_t pos = folio_pos(folio); + unsigned int offset, length; + + if (pos < start) + offset = start - pos; + else + offset = 0; + length = folio_size(folio); + if (pos + length <= (u64)end) + length = length - offset; + else + length = end + 1 - pos - offset; + + folio_wait_writeback(folio); + if (length == folio_size(folio)) { + truncate_inode_folio(folio->mapping, folio); + return true; + } + + /* + * We may be zeroing pages we're about to discard, but it avoids + * doing a complex calculation here, and then doing the zeroing + * anyway if the page split fails. + */ + folio_zero_range(folio, offset, length); + + cleancache_invalidate_page(folio->mapping, &folio->page); + if (folio_has_private(folio)) + do_invalidatepage(&folio->page, offset, length); + if (!folio_test_large(folio)) + return true; + if (split_huge_page(&folio->page) == 0) + return true; + if (folio_test_dirty(folio)) + return false; + truncate_inode_folio(folio->mapping, folio); + return true; +} + +/* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_page(struct address_space *mapping, struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (!mapping) return -EINVAL; /* @@ -243,7 +292,7 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; - return truncate_inode_page(mapping, page); + return truncate_inode_folio(mapping, page_folio(page)); } EXPORT_SYMBOL(generic_error_remove_page); @@ -294,20 +343,16 @@ void truncate_inode_pages_range(struct address_space *mapping, { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ - unsigned int partial_start; /* inclusive */ - unsigned int partial_end; /* exclusive */ - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; + struct folio *folio; + bool same_folio; if (mapping_empty(mapping)) goto out; - /* Offsets within partial pages */ - partial_start = lstart & (PAGE_SIZE - 1); - partial_end = (lend + 1) & (PAGE_SIZE - 1); - /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the @@ -325,64 +370,49 @@ void truncate_inode_pages_range(struct address_space *mapping, else end = (lend + 1) >> PAGE_SHIFT; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - index = indices[pagevec_count(&pvec) - 1] + 1; - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - for (i = 0; i < pagevec_count(&pvec); i++) - truncate_cleanup_page(pvec.pages[i]); - delete_from_page_cache_batch(mapping, &pvec); - for (i = 0; i < pagevec_count(&pvec); i++) - unlock_page(pvec.pages[i]); - pagevec_release(&pvec); + &fbatch, indices)) { + index = indices[folio_batch_count(&fbatch) - 1] + 1; + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + for (i = 0; i < folio_batch_count(&fbatch); i++) + truncate_cleanup_folio(fbatch.folios[i]); + delete_from_page_cache_batch(mapping, &fbatch); + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_unlock(fbatch.folios[i]); + folio_batch_release(&fbatch); cond_resched(); } - if (partial_start) { - struct page *page = find_lock_page(mapping, start - 1); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - /* Truncation within a single page */ - top = partial_end; - partial_end = 0; - } - wait_on_page_writeback(page); - zero_user_segment(page, partial_start, top); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial_start, - top - partial_start); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = find_lock_page(mapping, end); - if (page) { - wait_on_page_writeback(page); - zero_user_segment(page, 0, partial_end); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, 0, - partial_end); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, + FGP_LOCK, 0); + if (folio) { + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - /* - * If the truncation happened within a single page no pages - * will be released, just zeroed, so we can bail out now. - */ - if (start >= end) - goto out; index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@ -392,23 +422,24 @@ void truncate_inode_pages_range(struct address_space *mapping, continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - lock_page(page); - WARN_ON(page_to_index(page) != index); - wait_on_page_writeback(page); - truncate_inode_page(mapping, page); - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + folio_wait_writeback(folio); + truncate_inode_folio(mapping, folio); + folio_unlock(folio); + index = folio_index(folio) + folio_nr_pages(folio) - 1; } - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - pagevec_release(&pvec); + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + folio_batch_release(&fbatch); index++; } @@ -479,16 +510,16 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; - pagevec_init(&pvec); - while (find_lock_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + folio_batch_init(&fbatch); + while (find_lock_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct page *page = &fbatch.folios[i]->page; /* We rely upon deletion not changing page->index */ index = indices[i]; @@ -515,8 +546,8 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, } count += ret; } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } @@ -568,31 +599,29 @@ void invalidate_mapping_pagevec(struct address_space *mapping, * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the lru_cache_add() pagevecs. */ -static int -invalidate_complete_page2(struct address_space *mapping, struct page *page) +static int invalidate_complete_folio2(struct address_space *mapping, + struct folio *folio) { - if (page->mapping != mapping) + if (folio->mapping != mapping) return 0; - if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) return 0; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto failed; - BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL); + BUG_ON(folio_has_private(folio)); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(page); - - put_page(page); /* pagecache ref */ + filemap_free_folio(mapping, folio); return 1; failed: xa_unlock_irq(&mapping->i_pages); @@ -600,13 +629,13 @@ failed: return 0; } -static int do_launder_page(struct address_space *mapping, struct page *page) +static int do_launder_folio(struct address_space *mapping, struct folio *folio) { - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) return 0; - if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL) return 0; - return mapping->a_ops->launder_page(page); + return mapping->a_ops->launder_page(&folio->page); } /** @@ -624,7 +653,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; @@ -634,25 +663,25 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (mapping_empty(mapping)) goto out; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; - while (find_get_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (find_get_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, - index, page)) + index, folio)) ret = -EBUSY; continue; } - if (!did_range_unmap && page_mapped(page)) { + if (!did_range_unmap && folio_mapped(folio)) { /* - * If page is mapped, before taking its lock, + * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ unmap_mapping_pages(mapping, index, @@ -660,29 +689,29 @@ int invalidate_inode_pages2_range(struct address_space *mapping, did_range_unmap = 1; } - lock_page(page); - WARN_ON(page_to_index(page) != index); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + if (folio->mapping != mapping) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (page_mapped(page)) - unmap_mapping_page(page); - BUG_ON(page_mapped(page)); + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio)); - ret2 = do_launder_page(mapping, page); + ret2 = do_launder_folio(mapping, folio); if (ret2 == 0) { - if (!invalidate_complete_page2(mapping, page)) + if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; - unlock_page(page); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ac6f036298cd..0780c2a57ff1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -232,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, goto out; } + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; + } + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, page, false, wp_copy); if (ret) diff --git a/mm/util.c b/mm/util.c index 741ba32a43ac..7e43369064c8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -549,13 +549,10 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not - * fall back to vmalloc. - * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) @@ -564,13 +561,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) void *ret; /* - * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) - * so the given set of flags has to be compatible. - */ - if ((flags & GFP_KERNEL) != GFP_KERNEL) - return kmalloc_node(size, flags, node); - - /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore * contribute to a long term fragmentation less than vmalloc fallback. @@ -582,6 +572,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + kmalloc_flags &= ~__GFP_NOFAIL; } ret = kmalloc_node(size, kmalloc_flags, node); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d2a00ad4e1dd..4165304d3547 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,6 +31,7 @@ #include <linux/kmemleak.h> #include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/memcontrol.h> #include <linux/llist.h> #include <linux/bitops.h> #include <linux/rbtree_augmented.h> @@ -38,6 +39,7 @@ #include <linux/pgtable.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/sched/mm.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -2623,12 +2625,13 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { unsigned int page_order = vm_area_page_order(area); - int i; + int i, step = 1U << page_order; - for (i = 0; i < area->nr_pages; i += 1U << page_order) { + for (i = 0; i < area->nr_pages; i += step) { struct page *page = area->pages[i]; BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -step); __free_pages(page, page_order); cond_resched(); } @@ -2844,6 +2847,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { + gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; + while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -2861,12 +2866,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolcy want to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(gfp, + nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(gfp, nid, + nr = alloc_pages_bulk_array_node(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -2921,11 +2926,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; const gfp_t orig_gfp_mask = gfp_mask; + bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; + unsigned int flags; + int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; @@ -2955,6 +2963,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); + if (gfp_mask & __GFP_ACCOUNT) { + int i, step = 1U << page_order; + + for (i = 0; i < area->nr_pages; i += step) + mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, + step); + } /* * If not enough pages were obtained to accomplish an @@ -2967,8 +2982,28 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } - if (vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift) < 0) { + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + do { + ret = vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift); + if (nofail && (ret < 0)) + schedule_timeout_uninterruptible(1); + } while (nofail && (ret < 0)); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + + if (ret < 0) { warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); @@ -2996,12 +3031,14 @@ fail: * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp - * flags are not supported. GFP_KERNEL would be a preferred allocation mode - * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not - * supported. From the reclaim modifiers__GFP_DIRECT_RECLAIM is required (aka - * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka - * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported). - * __GFP_NOWARN can be used to suppress error messages about failures. + * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all + * supported. + * Zone modifiers are not supported. From the reclaim modifiers + * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) + * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and + * __GFP_RETRY_MAYFAIL are not supported). + * + * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. @@ -3056,9 +3093,14 @@ again: VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { + bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, vm_struct allocation failed", - real_size); + "vmalloc error: size %lu, vm_struct allocation failed%s", + real_size, (nofail) ? ". Retrying." : ""); + if (nofail) { + schedule_timeout_uninterruptible(1); + goto again; + } goto fail; } @@ -3074,7 +3116,8 @@ again: clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); - kmemleak_vmalloc(area, size, gfp_mask); + if (!(vm_flags & VM_DEFER_KMEMLEAK)) + kmemleak_vmalloc(area, size, gfp_mask); return addr; diff --git a/mm/vmscan.c b/mm/vmscan.c index fb9584641ac7..090bfb605ecf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -951,7 +951,7 @@ out: return freed; } -void drop_slab_node(int nid) +static void drop_slab_node(int nid) { unsigned long freed; int shift = 0; @@ -1021,6 +1021,39 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +static bool skip_throttle_noprogress(pg_data_t *pgdat) +{ + int reclaimable = 0, write_pending = 0; + int i; + + /* + * If kswapd is disabled, reschedule if necessary but do not + * throttle as the system is likely near OOM. + */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + + /* + * If there are a lot of dirty/writeback pages then do not + * throttle as throttling will occur when the pages cycle + * towards the end of the LRU if still under writeback. + */ + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + reclaimable += zone_reclaimable_pages(zone); + write_pending += zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); + } + if (2 * write_pending <= reclaimable) + return true; + + return false; +} + void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) { wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; @@ -1056,8 +1089,16 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) } break; + case VMSCAN_THROTTLE_CONGESTED: + fallthrough; case VMSCAN_THROTTLE_NOPROGRESS: - timeout = HZ/2; + if (skip_throttle_noprogress(pgdat)) { + cond_resched(); + return; + } + + timeout = 1; + break; case VMSCAN_THROTTLE_ISOLATED: timeout = HZ/50; @@ -3321,7 +3362,7 @@ again: if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc)) @@ -3386,16 +3427,16 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) } /* - * Do not throttle kswapd on NOPROGRESS as it will throttle on - * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under - * writeback and marked for immediate reclaim at the tail of - * the LRU. + * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will + * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages + * under writeback and marked for immediate reclaim at the tail of the + * LRU. */ - if (current_is_kswapd()) + if (current_is_kswapd() || cgroup_reclaim(sc)) return; /* Throttle if making no progress at high prioities. */ - if (sc->priority < DEF_PRIORITY - 2) + if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } @@ -3415,6 +3456,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_scanned; gfp_t orig_mask; pg_data_t *last_pgdat = NULL; + pg_data_t *first_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -3478,14 +3520,19 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) /* need some check for avoid more shrink_zone() */ } + if (!first_pgdat) + first_pgdat = zone->zone_pgdat; + /* See comment about same check for global reclaim above */ if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); - consider_reclaim_throttle(zone->zone_pgdat, sc); } + if (first_pgdat) + consider_reclaim_throttle(first_pgdat, sc); + /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. diff --git a/mm/vmstat.c b/mm/vmstat.c index d701c335628c..4057372745d0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1353,6 +1353,9 @@ const char * const vmstat_text[] = { "thp_split_page_failed", "thp_deferred_split_page", "thp_split_pmd", + "thp_scan_exceed_none_pte", + "thp_scan_exceed_swap_pte", + "thp_scan_exceed_share_pte", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", #endif diff --git a/mm/zpool.c b/mm/zpool.c index 6d9ed48141e5..68facc193496 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -24,16 +24,11 @@ struct zpool { const struct zpool_ops *ops; bool evictable; bool can_sleep_mapped; - - struct list_head list; }; static LIST_HEAD(drivers_head); static DEFINE_SPINLOCK(drivers_lock); -static LIST_HEAD(pools_head); -static DEFINE_SPINLOCK(pools_lock); - /** * zpool_register_driver() - register a zpool implementation. * @driver: driver to register @@ -195,10 +190,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, pr_debug("created pool type %s\n", type); - spin_lock(&pools_lock); - list_add(&zpool->list, &pools_head); - spin_unlock(&pools_lock); - return zpool; } @@ -217,9 +208,6 @@ void zpool_destroy_pool(struct zpool *zpool) { pr_debug("destroying pool type %s\n", zpool->driver->type); - spin_lock(&pools_lock); - list_del(&zpool->list); - spin_unlock(&pools_lock); zpool->driver->destroy(zpool->pool); zpool_put_driver(zpool->driver); kfree(zpool); |