summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-02-25 03:20:38 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-02-25 03:20:38 +0300
commit4c48faba5b7f18fb53e4aeeb768932f17c9da1ed (patch)
tree9d9d2ca881a670b5df55a663ee506f212c9514c3
parent062c84fccc4444805738d76a2699c4d3c95184ec (diff)
parenta553e3cd2053501b658feec2be9a3b662eb1b22b (diff)
downloadlinux-4c48faba5b7f18fb53e4aeeb768932f17c9da1ed.tar.xz
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: "A few small subsystems and some of MM. 172 patches. Subsystems affected by this patch series: hexagon, scripts, ntfs, ocfs2, vfs, and mm (slab-generic, slab, slub, debug, pagecache, swap, memcg, pagemap, mprotect, mremap, page-reporting, vmalloc, kasan, pagealloc, memory-failure, hugetlb, vmscan, z3fold, compaction, mempolicy, oom-kill, hugetlbfs, and migration)" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (172 commits) mm/migrate: remove unneeded semicolons hugetlbfs: remove unneeded return value of hugetlb_vmtruncate() hugetlbfs: fix some comment typos hugetlbfs: correct some obsolete comments about inode i_mutex hugetlbfs: make hugepage size conversion more readable hugetlbfs: remove meaningless variable avoid_reserve hugetlbfs: correct obsolete function name in hugetlbfs_read_iter() hugetlbfs: use helper macro default_hstate in init_hugetlbfs_fs hugetlbfs: remove useless BUG_ON(!inode) in hugetlbfs_setattr() hugetlbfs: remove special hugetlbfs_set_page_dirty() mm/hugetlb: change hugetlb_reserve_pages() to type bool mm, oom: fix a comment in dump_task() mm/mempolicy: use helper range_in_vma() in queue_pages_test_walk() numa balancing: migrate on fault among multiple bound nodes mm, compaction: make fast_isolate_freepages() stay within zone mm/compaction: fix misbehaviors of fast_find_migrateblock() mm/compaction: correct deferral logic for proactive compaction mm/compaction: remove duplicated VM_BUG_ON_PAGE !PageLocked mm/compaction: remove rcu_read_lock during page compaction z3fold: simplify the zhdr initialization code in init_z3fold_page() ...
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst4
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt8
-rw-r--r--Documentation/admin-guide/sysctl/vm.rst10
-rw-r--r--Documentation/core-api/mm-api.rst7
-rw-r--r--Documentation/dev-tools/kasan.rst24
-rw-r--r--Documentation/vm/arch_pgtable_helpers.rst8
-rw-r--r--arch/arm64/include/asm/memory.h1
-rw-r--r--arch/arm64/include/asm/mte-kasan.h12
-rw-r--r--arch/arm64/kernel/mte.c12
-rw-r--r--arch/arm64/mm/fault.c20
-rw-r--r--arch/hexagon/configs/comet_defconfig1
-rw-r--r--arch/ia64/include/asm/pgtable.h6
-rw-r--r--arch/ia64/mm/init.c14
-rw-r--r--arch/mips/mm/pgtable-32.c1
-rw-r--r--arch/mips/mm/pgtable-64.c1
-rw-r--r--drivers/base/node.c33
-rw-r--r--drivers/video/fbdev/acornfb.c34
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/buffer.c7
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/exec.c4
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/fuse/dev.c6
-rw-r--r--fs/hugetlbfs/inode.c72
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ntfs/layout.h4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c8
-rw-r--r--fs/ocfs2/dlm/dlmast.c10
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/proc/meminfo.c10
-rw-r--r--fs/proc/vmcore.c7
-rw-r--r--fs/ramfs/inode.c13
-rw-r--r--include/linux/fs.h4
-rw-r--r--include/linux/gfp.h14
-rw-r--r--include/linux/highmem-internal.h5
-rw-r--r--include/linux/huge_mm.h15
-rw-r--r--include/linux/hugetlb.h98
-rw-r--r--include/linux/kasan-checks.h6
-rw-r--r--include/linux/kasan.h37
-rw-r--r--include/linux/memcontrol.h43
-rw-r--r--include/linux/migrate.h2
-rw-r--r--include/linux/mm.h22
-rw-r--r--include/linux/mm_inline.h113
-rw-r--r--include/linux/mmzone.h22
-rw-r--r--include/linux/page-flags.h6
-rw-r--r--include/linux/page_counter.h9
-rw-r--r--include/linux/pagemap.h5
-rw-r--r--include/linux/swap.h8
-rw-r--r--include/trace/events/kmem.h24
-rw-r--r--include/trace/events/pagemap.h11
-rw-r--r--include/uapi/linux/mempolicy.h4
-rw-r--r--init/Kconfig14
-rw-r--r--lib/Kconfig.kasan6
-rw-r--r--lib/Makefile2
-rw-r--r--lib/test_kasan.c424
-rw-r--r--lib/test_kasan_module.c5
-rw-r--r--mm/backing-dev.c6
-rw-r--r--mm/compaction.c73
-rw-r--r--mm/debug.c10
-rw-r--r--mm/debug_vm_pgtable.c86
-rw-r--r--mm/filemap.c583
-rw-r--r--mm/gup.c5
-rw-r--r--mm/huge_memory.c28
-rw-r--r--mm/hugetlb.c346
-rw-r--r--mm/hugetlb_cgroup.c6
-rw-r--r--mm/kasan/common.c56
-rw-r--r--mm/kasan/generic.c38
-rw-r--r--mm/kasan/hw_tags.c16
-rw-r--r--mm/kasan/kasan.h81
-rw-r--r--mm/kasan/quarantine.c22
-rw-r--r--mm/kasan/report.c15
-rw-r--r--mm/kasan/report_generic.c8
-rw-r--r--mm/kasan/report_hw_tags.c8
-rw-r--r--mm/kasan/report_sw_tags.c8
-rw-r--r--mm/kasan/shadow.c27
-rw-r--r--mm/kasan/sw_tags.c20
-rw-r--r--mm/khugepaged.c6
-rw-r--r--mm/list_lru.c12
-rw-r--r--mm/memcontrol.c279
-rw-r--r--mm/memory-failure.c34
-rw-r--r--mm/memory.c24
-rw-r--r--mm/memory_hotplug.c11
-rw-r--r--mm/mempolicy.c18
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/mlock.c3
-rw-r--r--mm/mmap.c4
-rw-r--r--mm/mprotect.c7
-rw-r--r--mm/mremap.c8
-rw-r--r--mm/oom_kill.c5
-rw-r--r--mm/page_alloc.c64
-rw-r--r--mm/page_io.c12
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/page_reporting.c2
-rw-r--r--mm/pgtable-generic.c5
-rw-r--r--mm/rmap.c35
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab.c21
-rw-r--r--mm/slab.h20
-rw-r--r--mm/slab_common.c40
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c169
-rw-r--r--mm/swap.c50
-rw-r--r--mm/swap_slots.c3
-rw-r--r--mm/swap_state.c31
-rw-r--r--mm/swapfile.c8
-rw-r--r--mm/vmscan.c98
-rw-r--r--mm/vmstat.c14
-rw-r--r--mm/workingset.c7
-rw-r--r--mm/z3fold.c11
-rw-r--r--scripts/spelling.txt30
-rw-r--r--tools/objtool/check.c2
117 files changed, 2027 insertions, 1734 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index c513eafaddea..9acc57f68f31 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1299,6 +1299,10 @@ PAGE_SIZE multiple when read back.
Amount of cached filesystem data that was modified and
is currently being written back to disk
+ swapcached
+ Amount of swap cached in memory. The swapcache is accounted
+ against both memory and swap usage.
+
anon_thp
Amount of memory used in anonymous mappings backed by
transparent hugepages
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 186ad235bf32..bab6a8b01202 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4904,14 +4904,6 @@
last alloc / free. For more information see
Documentation/vm/slub.rst.
- slub_memcg_sysfs= [MM, SLUB]
- Determines whether to enable sysfs directories for
- memory cgroup sub-caches. 1 to enable, 0 to disable.
- The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON.
- Enabling this can lead to a very high number of debug
- directories and files being created under
- /sys/kernel/slub.
-
slub_max_order= [MM, SLUB]
Determines the maximum allowed order for slabs.
A high setting may cause OOMs due to memory
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index e35a3f2fb006..586cd4b86428 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -983,11 +983,11 @@ that benefit from having their data cached, zone_reclaim_mode should be
left disabled as the caching effect is likely to be more important than
data locality.
-zone_reclaim may be enabled if it's known that the workload is partitioned
-such that each partition fits within a NUMA node and that accessing remote
-memory would cause a measurable performance reduction. The page allocator
-will then reclaim easily reusable pages (those page cache pages that are
-currently not used) before allocating off node pages.
+Consider enabling one or more zone_reclaim mode bits if it's known that the
+workload is partitioned such that each partition fits within a NUMA node
+and that accessing remote memory would cause a measurable performance
+reduction. The page allocator will take additional actions before
+allocating off node pages.
Allowing zone reclaim to write out pages stops processes that are
writing large amounts of data from dirtying pages on other nodes. Zone
diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index 2adffb3f7914..201b5423303b 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -19,11 +19,8 @@ User Space Memory Access
Memory Allocation Controls
==========================
-Functions which need to allocate memory often use GFP flags to express
-how that memory should be allocated. The GFP acronym stands for "get
-free pages", the underlying memory allocation function. Not every GFP
-flag is allowed to every function which may allocate memory. Most
-users will want to use a plain ``GFP_KERNEL``.
+.. kernel-doc:: include/linux/gfp.h
+ :internal:
.. kernel-doc:: include/linux/gfp.h
:doc: Page mobility and placement hints
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index a248ac3941be..cde14aeefca7 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -147,15 +147,14 @@ negative values to distinguish between different kinds of inaccessible memory
like redzones or freed memory (see mm/kasan/kasan.h).
In the report above the arrows point to the shadow byte 03, which means that
-the accessed address is partially accessible.
-
-For tag-based KASAN this last report section shows the memory tags around the
-accessed address (see `Implementation details`_ section).
+the accessed address is partially accessible. For tag-based KASAN modes this
+last report section shows the memory tags around the accessed address
+(see the `Implementation details`_ section).
Boot parameters
~~~~~~~~~~~~~~~
-Hardware tag-based KASAN mode (see the section about different mode below) is
+Hardware tag-based KASAN mode (see the section about various modes below) is
intended for use in production as a security mitigation. Therefore it supports
boot parameters that allow to disable KASAN competely or otherwise control
particular KASAN features.
@@ -289,6 +288,13 @@ reserved to tag freed memory regions.
Hardware tag-based KASAN currently only supports tagging of
kmem_cache_alloc/kmalloc and page_alloc memory.
+If the hardware doesn't support MTE (pre ARMv8.5), hardware tag-based KASAN
+won't be enabled. In this case all boot parameters are ignored.
+
+Note, that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being
+enabled. Even when kasan.mode=off is provided, or when the hardware doesn't
+support MTE (but supports TBI).
+
What memory accesses are sanitised by KASAN?
--------------------------------------------
@@ -352,17 +358,17 @@ unmapped. This will require changes in arch-specific code.
This allows ``VMAP_STACK`` support on x86, and can simplify support of
architectures that do not have a fixed module region.
-CONFIG_KASAN_KUNIT_TEST & CONFIG_TEST_KASAN_MODULE
---------------------------------------------------
+CONFIG_KASAN_KUNIT_TEST and CONFIG_KASAN_MODULE_TEST
+----------------------------------------------------
-KASAN tests consist on two parts:
+KASAN tests consist of two parts:
1. Tests that are integrated with the KUnit Test Framework. Enabled with
``CONFIG_KASAN_KUNIT_TEST``. These tests can be run and partially verified
automatically in a few different ways, see the instructions below.
2. Tests that are currently incompatible with KUnit. Enabled with
-``CONFIG_TEST_KASAN_MODULE`` and can only be run as a module. These tests can
+``CONFIG_KASAN_MODULE_TEST`` and can only be run as a module. These tests can
only be verified manually, by loading the kernel module and inspecting the
kernel log for KASAN reports.
diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst
index f3591ee3aaa8..552567d863b8 100644
--- a/Documentation/vm/arch_pgtable_helpers.rst
+++ b/Documentation/vm/arch_pgtable_helpers.rst
@@ -50,7 +50,7 @@ PTE Page Table Helpers
+---------------------------+--------------------------------------------------+
| pte_mkwrite | Creates a writable PTE |
+---------------------------+--------------------------------------------------+
-| pte_mkwrprotect | Creates a write protected PTE |
+| pte_wrprotect | Creates a write protected PTE |
+---------------------------+--------------------------------------------------+
| pte_mkspecial | Creates a special PTE |
+---------------------------+--------------------------------------------------+
@@ -120,7 +120,7 @@ PMD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pmd_mkwrite | Creates a writable PMD |
+---------------------------+--------------------------------------------------+
-| pmd_mkwrprotect | Creates a write protected PMD |
+| pmd_wrprotect | Creates a write protected PMD |
+---------------------------+--------------------------------------------------+
| pmd_mkspecial | Creates a special PMD |
+---------------------------+--------------------------------------------------+
@@ -186,7 +186,7 @@ PUD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pud_mkwrite | Creates a writable PUD |
+---------------------------+--------------------------------------------------+
-| pud_mkwrprotect | Creates a write protected PUD |
+| pud_wrprotect | Creates a write protected PUD |
+---------------------------+--------------------------------------------------+
| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD |
+---------------------------+--------------------------------------------------+
@@ -224,7 +224,7 @@ HugeTLB Page Table Helpers
+---------------------------+--------------------------------------------------+
| huge_pte_mkwrite | Creates a writable HugeTLB |
+---------------------------+--------------------------------------------------+
-| huge_pte_mkwrprotect | Creates a write protected HugeTLB |
+| huge_pte_wrprotect | Creates a write protected HugeTLB |
+---------------------------+--------------------------------------------------+
| huge_ptep_get_and_clear | Clears a HugeTLB |
+---------------------------+--------------------------------------------------+
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index bc09af26c1b8..c759faf7a1ff 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -244,6 +244,7 @@ static inline const void *__tag_set(const void *addr, u8 tag)
#ifdef CONFIG_KASAN_HW_TAGS
#define arch_enable_tagging() mte_enable_kernel()
+#define arch_set_tagging_report_once(state) mte_set_report_once(state)
#define arch_init_tags(max_tag) mte_init_tags(max_tag)
#define arch_get_random_tag() mte_get_random_tag()
#define arch_get_mem_tag(addr) mte_get_mem_tag(addr)
diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index 26349a4b5e2e..3748d5bb88c0 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -32,6 +32,9 @@ void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag);
void mte_enable_kernel(void);
void mte_init_tags(u64 max_tag);
+void mte_set_report_once(bool state);
+bool mte_report_once(void);
+
#else /* CONFIG_ARM64_MTE */
static inline u8 mte_get_ptr_tag(void *ptr)
@@ -60,6 +63,15 @@ static inline void mte_init_tags(u64 max_tag)
{
}
+static inline void mte_set_report_once(bool state)
+{
+}
+
+static inline bool mte_report_once(void)
+{
+ return false;
+}
+
#endif /* CONFIG_ARM64_MTE */
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 80b62fe49dcf..2cfc850809ce 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -25,6 +25,8 @@
u64 gcr_kernel_excl __ro_after_init;
+static bool report_fault_once = true;
+
static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
{
pte_t old_pte = READ_ONCE(*ptep);
@@ -158,6 +160,16 @@ void mte_enable_kernel(void)
isb();
}
+void mte_set_report_once(bool state)
+{
+ WRITE_ONCE(report_fault_once, state);
+}
+
+bool mte_report_once(void)
+{
+ return READ_ONCE(report_fault_once);
+}
+
static void update_sctlr_el1_tcf0(u64 tcf0)
{
/* ISB required for the kernel uaccess routines */
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 2e339f0bd958..dc9f96442edc 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -302,12 +302,24 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
static void report_tag_fault(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
- bool is_write = ((esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT) != 0;
+ static bool reported;
+ bool is_write;
+
+ if (READ_ONCE(reported))
+ return;
+
+ /*
+ * This is used for KASAN tests and assumes that no MTE faults
+ * happened before running the tests.
+ */
+ if (mte_report_once())
+ WRITE_ONCE(reported, true);
/*
* SAS bits aren't set for all faults reported in EL1, so we can't
* find out access size.
*/
+ is_write = !!(esr & ESR_ELx_WNR);
kasan_report(addr, 0, is_write, regs->pc);
}
#else
@@ -319,12 +331,8 @@ static inline void report_tag_fault(unsigned long addr, unsigned int esr,
static void do_tag_recovery(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
- static bool reported;
- if (!READ_ONCE(reported)) {
- report_tag_fault(addr, esr, regs);
- WRITE_ONCE(reported, true);
- }
+ report_tag_fault(addr, esr, regs);
/*
* Disable MTE Tag Checking on the local CPU for the current EL.
diff --git a/arch/hexagon/configs/comet_defconfig b/arch/hexagon/configs/comet_defconfig
index e324f65f41e7..f19ae2ab0aaa 100644
--- a/arch/hexagon/configs/comet_defconfig
+++ b/arch/hexagon/configs/comet_defconfig
@@ -1,7 +1,6 @@
CONFIG_SMP=y
CONFIG_DEFAULT_MMAP_MIN_ADDR=0
CONFIG_HZ_100=y
-CONFIG_EXPERIMENTAL=y
CONFIG_CROSS_COMPILE="hexagon-"
CONFIG_LOCALVERSION="-smp"
# CONFIG_LOCALVERSION_AUTO is not set
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 779b6972aa84..9b4efe89e62d 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -517,12 +517,6 @@ extern struct page *zero_page_memmap_ptr;
__changed; \
})
#endif
-
-# ifdef CONFIG_VIRTUAL_MEM_MAP
- /* arch mem_map init routine is needed due to holes in a virtual mem_map */
- extern void memmap_init (unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn);
-# endif /* CONFIG_VIRTUAL_MEM_MAP */
# endif /* !__ASSEMBLY__ */
/*
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b19f47a5a305..16d0d7d22657 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -536,18 +536,20 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
/ sizeof(struct page));
if (map_start < map_end)
- memmap_init_zone((unsigned long)(map_end - map_start),
+ memmap_init_range((unsigned long)(map_end - map_start),
args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end),
MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
return 0;
}
-void __meminit
-memmap_init (unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn)
+void __meminit memmap_init_zone(struct zone *zone)
{
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long size = zone->spanned_pages;
+
if (!vmem_map) {
- memmap_init_zone(size, nid, zone, start_pfn, start_pfn + size,
+ memmap_init_range(size, nid, zone_id, start_pfn, start_pfn + size,
MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
} else {
struct page *start;
@@ -557,7 +559,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone,
args.start = start;
args.end = start + size;
args.nid = nid;
- args.zone = zone;
+ args.zone = zone_id;
efi_memmap_walk(virtual_memmap_init, &args);
}
diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c
index bd4b0656add3..61891af25019 100644
--- a/arch/mips/mm/pgtable-32.c
+++ b/arch/mips/mm/pgtable-32.c
@@ -45,7 +45,6 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
*pmdp = pmd;
- flush_tlb_all();
}
#endif /* defined(CONFIG_TRANSPARENT_HUGEPAGE) */
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index 183ff9f9c026..7536f7804c44 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -100,7 +100,6 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
*pmdp = pmd;
- flush_tlb_all();
}
void __init pagetable_init(void)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 04f71c7bc3f8..f449dbb2c746 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -372,14 +372,19 @@ static ssize_t node_read_meminfo(struct device *dev,
struct pglist_data *pgdat = NODE_DATA(nid);
struct sysinfo i;
unsigned long sreclaimable, sunreclaimable;
+ unsigned long swapcached = 0;
si_meminfo_node(&i, nid);
sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
+#ifdef CONFIG_SWAP
+ swapcached = node_page_state_pages(pgdat, NR_SWAPCACHE);
+#endif
len = sysfs_emit_at(buf, len,
"Node %d MemTotal: %8lu kB\n"
"Node %d MemFree: %8lu kB\n"
"Node %d MemUsed: %8lu kB\n"
+ "Node %d SwapCached: %8lu kB\n"
"Node %d Active: %8lu kB\n"
"Node %d Inactive: %8lu kB\n"
"Node %d Active(anon): %8lu kB\n"
@@ -391,6 +396,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(i.totalram),
nid, K(i.freeram),
nid, K(i.totalram - i.freeram),
+ nid, K(swapcached),
nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
node_page_state(pgdat, NR_ACTIVE_FILE)),
nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
@@ -461,16 +467,11 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(sunreclaimable)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
,
- nid, K(node_page_state(pgdat, NR_ANON_THPS) *
- HPAGE_PMD_NR),
- nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
- HPAGE_PMD_NR),
- nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
- HPAGE_PMD_NR),
- nid, K(node_page_state(pgdat, NR_FILE_THPS) *
- HPAGE_PMD_NR),
- nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
- HPAGE_PMD_NR)
+ nid, K(node_page_state(pgdat, NR_ANON_THPS)),
+ nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
+ nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
+ nid, K(node_page_state(pgdat, NR_FILE_THPS)),
+ nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
#endif
);
len += hugetlb_report_node_meminfo(buf, len, nid);
@@ -519,10 +520,14 @@ static ssize_t node_read_vmstat(struct device *dev,
sum_zone_numa_state(nid, i));
#endif
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- len += sysfs_emit_at(buf, len, "%s %lu\n",
- node_stat_name(i),
- node_page_state_pages(pgdat, i));
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ unsigned long pages = node_page_state_pages(pgdat, i);
+
+ if (vmstat_item_print_in_thp(i))
+ pages /= HPAGE_PMD_NR;
+ len += sysfs_emit_at(buf, len, "%s %lu\n", node_stat_name(i),
+ pages);
+ }
return len;
}
diff --git a/drivers/video/fbdev/acornfb.c b/drivers/video/fbdev/acornfb.c
index bcc92aecf666..1b72edc01cfb 100644
--- a/drivers/video/fbdev/acornfb.c
+++ b/drivers/video/fbdev/acornfb.c
@@ -921,40 +921,6 @@ static int acornfb_detect_monitortype(void)
return 4;
}
-/*
- * This enables the unused memory to be freed on older Acorn machines.
- * We are freeing memory on behalf of the architecture initialisation
- * code here.
- */
-static inline void
-free_unused_pages(unsigned int virtual_start, unsigned int virtual_end)
-{
- int mb_freed = 0;
-
- /*
- * Align addresses
- */
- virtual_start = PAGE_ALIGN(virtual_start);
- virtual_end = PAGE_ALIGN(virtual_end);
-
- while (virtual_start < virtual_end) {
- struct page *page;
-
- /*
- * Clear page reserved bit,
- * set count to 1, and free
- * the page.
- */
- page = virt_to_page(virtual_start);
- __free_reserved_page(page);
-
- virtual_start += PAGE_SIZE;
- mb_freed += PAGE_SIZE / 1024;
- }
-
- printk("acornfb: freed %dK memory\n", mb_freed);
-}
-
static int acornfb_probe(struct platform_device *dev)
{
unsigned long size;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ec26179c8062..91ff9bfd7c1a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1270,7 +1270,7 @@ rescan:
return ret;
}
/*
- * Only exported for for loop and dasd for historic reasons. Don't use in new
+ * Only exported for loop and dasd for historic reasons. Don't use in new
* code!
*/
EXPORT_SYMBOL_GPL(bdev_disk_changed);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index be9e3900cce8..bf2c51a9607a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3634,7 +3634,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
- return generic_file_buffered_read(iocb, to, ret);
+ return filemap_read(iocb, to, ret);
}
const struct file_operations btrfs_file_operations = {
diff --git a/fs/buffer.c b/fs/buffer.c
index 32647d2011df..0cb7ffd4977c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -847,7 +847,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
if (retry)
gfp |= __GFP_NOFAIL;
- memcg = get_mem_cgroup_from_page(page);
+ /* The page lock pins the memcg */
+ memcg = page_memcg(page);
old_memcg = set_active_memcg(memcg);
head = NULL;
@@ -868,7 +869,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
}
out:
set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
return head;
/*
* In case anything failed, we just free everything we got.
@@ -2083,7 +2083,8 @@ static int __block_commit_write(struct inode *inode, struct page *page,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
- clear_buffer_new(bh);
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
block_start = block_end;
bh = bh->b_this_page;
diff --git a/fs/dcache.c b/fs/dcache.c
index 799d9e4f0bcd..7d24ff7eb206 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2176,8 +2176,8 @@ EXPORT_SYMBOL(d_obtain_root);
* same inode, only the actual correct case is stored in the dcache for
* case-insensitive filesystems.
*
- * For a case-insensitive lookup match and if the the case-exact dentry
- * already exists in in the dcache, use it and return it.
+ * For a case-insensitive lookup match and if the case-exact dentry
+ * already exists in the dcache, use it and return it.
*
* If no entry exists with the exact case name, allocate new dentry with
* the exact case, and return the spliced entry.
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aa1083ecd623..0957e1bb8eb2 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -462,7 +462,7 @@ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
* Wait for the next BIO to complete. Remove it and return it. NULL is
* returned once all BIOs have been completed. This must only be called once
* all bios have been issued so that dio->refcount can only decrease. This
- * requires that that the caller hold a reference on the dio.
+ * requires that the caller hold a reference on the dio.
*/
static struct bio *dio_await_one(struct dio *dio)
{
@@ -1279,7 +1279,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
if (retval == -ENOTBLK) {
/*
* The remaining part of the request will be
- * be handled by buffered I/O when we return
+ * handled by buffered I/O when we return
*/
retval = 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index 6f3c02066ce3..18594f11c31f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1455,7 +1455,7 @@ EXPORT_SYMBOL(finalize_exec);
/*
* Prepare credentials and lock ->cred_guard_mutex.
* setup_new_exec() commits the new creds and drops the lock.
- * Or, if exec fails before, free_bprm() should release ->cred and
+ * Or, if exec fails before, free_bprm() should release ->cred
* and unlock.
*/
static int prepare_bprm_creds(struct linux_binprm *bprm)
@@ -1841,7 +1841,7 @@ static int bprm_execve(struct linux_binprm *bprm,
out:
/*
- * If past the point of no return ensure the the code never
+ * If past the point of no return ensure the code never
* returns to the userspace process. Use an existing fatal
* signal if present otherwise terminate the process with
* SIGSEGV.
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 01263ffbc4c0..ec6feeccc276 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -173,7 +173,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
/*
* With handle we don't look at the execute bit on the
- * the directory. Ideally we would like CAP_DAC_SEARCH.
+ * directory. Ideally we would like CAP_DAC_SEARCH.
* But we don't have that
*/
if (!capable(CAP_DAC_READ_SEARCH)) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 588f8d1240aa..c6636b4c4ccf 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -844,11 +844,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (WARN_ON(PageMlocked(oldpage)))
goto out_fallback_unlock;
- err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
- if (err) {
- unlock_page(newpage);
- goto out_put_old;
- }
+ replace_page_cache_page(oldpage, newpage);
get_page(newpage);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b7a72f577aab..701c82c36138 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -171,7 +171,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
ret = -ENOMEM;
- if (hugetlb_reserve_pages(inode,
+ if (!hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
vma->vm_flags))
@@ -310,7 +310,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
/*
* Support for read() - Find the page attached to f_mapping and copy out the
- * data. Its *very* similar to do_generic_mapping_read(), we can't use that
+ * data. Its *very* similar to generic_file_buffered_read(), we can't use that
* since it has PAGE_SIZE assumptions.
*/
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -442,15 +442,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
*
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
- * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
* maps and global counts. Page faults can not race with truncation
* in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents
* page faults in the truncated range by checking i_size. i_size is
* modified while holding i_mmap_rwsem.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
- * Only when releasing a page is the associated region/reserv map
- * deleted. The region/reserv map for ranges without associated
+ * Only when releasing a page is the associated region/reserve map
+ * deleted. The region/reserve map for ranges without associated
* pages are not modified. Page faults can race with hole punch.
* This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
@@ -567,7 +567,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
+static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
@@ -582,7 +582,6 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
- return 0;
}
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
@@ -604,7 +603,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode);
return -EPERM;
@@ -680,7 +679,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
*/
struct page *page;
unsigned long addr;
- int avoid_reserve = 0;
cond_resched();
@@ -716,8 +714,15 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
continue;
}
- /* Allocate page and add to page cache */
- page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ /*
+ * Allocate page without setting the avoid_reserve argument.
+ * There certainly are no reserves associated with the
+ * pseudo_vma. However, there could be shared mappings with
+ * reserves for the file at the inode level. If we fallocate
+ * pages in these areas, we need to consume the reserves
+ * to keep reservation accounting consistent.
+ */
+ page = alloc_huge_page(&pseudo_vma, addr, 0);
hugetlb_drop_vma_policy(&pseudo_vma);
if (IS_ERR(page)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -735,7 +740,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- set_page_huge_active(page);
+ SetHPageMigratable(page);
/*
* unlock_page because locked by add_to_page_cache()
* put_page() due to reference from alloc_huge_page()
@@ -761,8 +766,6 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
unsigned int ia_valid = attr->ia_valid;
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
- BUG_ON(!inode);
-
error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
@@ -773,13 +776,11 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
if (newsize & ~huge_page_mask(h))
return -EINVAL;
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
- error = hugetlb_vmtruncate(inode, newsize);
- if (error)
- return error;
+ hugetlb_vmtruncate(inode, newsize);
}
setattr_copy(&init_user_ns, inode, attr);
@@ -952,17 +953,6 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
return error;
}
-/*
- * mark the head page dirty
- */
-static int hugetlbfs_set_page_dirty(struct page *page)
-{
- struct page *head = compound_head(page);
-
- SetPageDirty(head);
- return 0;
-}
-
static int hugetlbfs_migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
@@ -973,15 +963,9 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- /*
- * page_private is subpool pointer in hugetlb pages. Transfer to
- * new page. PagePrivate is not associated with page_private for
- * hugetlb pages and can not be set here as only page_huge_active
- * pages can be migrated.
- */
- if (page_private(page)) {
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
+ if (hugetlb_page_subpool(page)) {
+ hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page));
+ hugetlb_set_page_subpool(page, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1156,7 +1140,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
- .set_page_dirty = hugetlbfs_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.migratepage = hugetlbfs_migrate_page,
.error_remove_page = hugetlbfs_error_remove_page,
};
@@ -1356,7 +1340,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
/*
* Allocate and initialize subpool if maximum or minimum size is
- * specified. Any needed reservations (for minimim size) are taken
+ * specified. Any needed reservations (for minimum size) are taken
* taken when the subpool is created.
*/
if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
@@ -1499,7 +1483,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
inode->i_size = size;
clear_nlink(inode);
- if (hugetlb_reserve_pages(inode, 0,
+ if (!hugetlb_reserve_pages(inode, 0,
size >> huge_page_shift(hstate_inode(inode)), NULL,
acctflag))
file = ERR_PTR(-ENOMEM);
@@ -1533,8 +1517,8 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
put_fs_context(fc);
}
if (IS_ERR(mnt))
- pr_err("Cannot mount internal hugetlbfs for page size %uK",
- 1U << (h->order + PAGE_SHIFT - 10));
+ pr_err("Cannot mount internal hugetlbfs for page size %luK",
+ huge_page_size(h) >> 10);
return mnt;
}
@@ -1562,7 +1546,7 @@ static int __init init_hugetlbfs_fs(void)
goto out_free;
/* default hstate mount is required */
- mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+ mnt = mount_one_hugetlbfs(&default_hstate);
if (IS_ERR(mnt)) {
error = PTR_ERR(mnt);
goto out_unreg;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4435dbbc0b63..f5c058b3192c 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -629,6 +629,12 @@ static int ntfs_read_locked_inode(struct inode *vi)
}
a = ctx->attr;
/* Get the standard information attribute value. */
+ if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset)
+ + le32_to_cpu(a->data.resident.value_length) >
+ (u8 *)ctx->mrec + vol->mft_record_size) {
+ ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
+ goto unm_err_out;
+ }
si = (STANDARD_INFORMATION*)((u8*)a +
le16_to_cpu(a->data.resident.value_offset));
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 85422761ff43..5d4bf7a3259f 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -703,7 +703,7 @@ typedef struct {
/* 14*/ le16 instance; /* The instance of this attribute record. This
number is unique within this mft record (see
MFT_RECORD/next_attribute_instance notes in
- in mft.h for more details). */
+ mft.h for more details). */
/* 16*/ union {
/* Resident attributes. */
struct {
@@ -1838,7 +1838,7 @@ typedef struct {
* Also, each security descriptor is stored twice in the $SDS stream with a
* fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
* between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
- * the first copy of the security descriptor will be at offset 0x51d0 in the
+ * first copy of the security descriptor will be at offset 0x51d0 in the
* $SDS data stream and the second copy will be at offset 0x451d0.
*/
typedef struct {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 0179a73a3fa2..12a7590601dd 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2042,7 +2042,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
o2hb_nego_timeout_handler,
reg, NULL, &reg->hr_handler_list);
if (ret)
- goto free;
+ goto remove_item;
ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
sizeof(struct o2hb_nego_msg),
@@ -2057,6 +2057,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
unregister_handler:
o2net_unregister_handler_list(&reg->hr_handler_list);
+remove_item:
+ spin_lock(&o2hb_live_lock);
+ list_del(&reg->hr_all_item);
+ if (o2hb_global_heartbeat_active())
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
free:
kfree(reg);
return ERR_PTR(ret);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 6abaded3ff6b..70a10764f249 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -165,16 +165,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
spin_unlock(&lock->spinlock);
}
-void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
-{
- BUG_ON(!dlm);
- BUG_ON(!lock);
-
- spin_lock(&dlm->ast_lock);
- __dlm_queue_bast(dlm, lock);
- spin_unlock(&dlm->ast_lock);
-}
-
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index c8a444622faa..58d57e25d384 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -17,10 +17,7 @@
#define DLM_LOCKID_NAME_MAX 32
-#define DLM_DOMAIN_NAME_MAX_LEN 255
#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES
-#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
-#define DLM_THREAD_MS 200 // flush at least every 200 ms
#define DLM_HASH_SIZE_DEFAULT (1 << 17)
#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
@@ -902,7 +899,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
-void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_do_local_ast(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index c26937824be1..c19a463fac55 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -978,7 +978,7 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
return 0;
}
- if (!eb || (eb && !eb->h_next_leaf_blk)) {
+ if (!eb || !eb->h_next_leaf_blk) {
/*
* We are the last extent rec, so any high cpos should
* be stored in this leaf refcount block.
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2febc76e9de7..079f8826993e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -973,8 +973,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
* quota files */
dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED);
- if (!inode)
- continue;
iput(inode);
}
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 39c96845a72f..bfd946a9ad01 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -171,7 +171,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
*
* Description:
* This function grabs an extra reference to @buf. It's used in
- * in the tee() system call, when we duplicate the buffers in one
+ * the tee() system call, when we duplicate the buffers in one
* pipe into another.
*/
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d6fc74619625..6fa761c9cc78 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -129,15 +129,15 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
show_val_kb(m, "AnonHugePages: ",
- global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_ANON_THPS));
show_val_kb(m, "ShmemHugePages: ",
- global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_SHMEM_THPS));
show_val_kb(m, "ShmemPmdMapped: ",
- global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+ global_node_page_state(NR_SHMEM_PMDMAPPED));
show_val_kb(m, "FileHugePages: ",
- global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_FILE_THPS));
show_val_kb(m, "FilePmdMapped: ",
- global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
+ global_node_page_state(NR_FILE_PMDMAPPED));
#endif
#ifdef CONFIG_CMA
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index c3a345c28a93..9a15334da208 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1503,11 +1503,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
return 0;
out_err:
- if (buf)
- vfree(buf);
-
- if (dump)
- vfree(dump);
+ vfree(buf);
+ vfree(dump);
return ret;
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3c2658c8fde0..9ebd17d7befb 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -151,6 +151,18 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
return error;
}
+static int ramfs_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
+ if (!inode)
+ return -ENOSPC;
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
@@ -161,6 +173,7 @@ static const struct inode_operations ramfs_dir_inode_operations = {
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
+ .tmpfile = ramfs_tmpfile,
};
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 418b772d6eca..ec8f3ddf4a6a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3080,8 +3080,8 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
extern int generic_write_check_limits(struct file *file, loff_t pos,
loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
-extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
- struct iov_iter *to, ssize_t already_read);
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
+ ssize_t already_read);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 80544d5c08e7..220cd553a9e7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -8,6 +8,20 @@
#include <linux/linkage.h>
#include <linux/topology.h>
+/* The typedef is in types.h but we want the documentation here */
+#if 0
+/**
+ * typedef gfp_t - Memory allocation flags.
+ *
+ * GFP flags are commonly used throughout Linux to indicate how memory
+ * should be allocated. The GFP acronym stands for get_free_pages(),
+ * the underlying memory allocation function. Not every GFP flag is
+ * supported by every function which may allocate memory. Most users
+ * will want to use a plain ``GFP_KERNEL``.
+ */
+typedef unsigned int __bitwise gfp_t;
+#endif
+
struct vm_area_struct;
/*
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 1bbe96dc8be6..7902c7d8b55f 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -127,11 +127,6 @@ static inline unsigned long totalhigh_pages(void)
return (unsigned long)atomic_long_read(&_totalhigh_pages);
}
-static inline void totalhigh_pages_inc(void)
-{
- atomic_long_inc(&_totalhigh_pages);
-}
-
static inline void totalhigh_pages_add(long count)
{
atomic_long_add(count, &_totalhigh_pages);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6a19f35f836b..ba973efcd369 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -78,6 +78,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn,
}
enum transparent_hugepage_flag {
+ TRANSPARENT_HUGEPAGE_NEVER_DAX,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
@@ -123,6 +124,13 @@ extern unsigned long transparent_hugepage_flags;
*/
static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
{
+
+ /*
+ * If the hardware/firmware marked hugepage support disabled.
+ */
+ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+ return false;
+
if (vma->vm_flags & VM_NOHUGEPAGE)
return false;
@@ -134,12 +142,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
return true;
- /*
- * For dax vmas, try to always use hugepage mappings. If the kernel does
- * not support hugepages, fsdax mappings will fallback to PAGE_SIZE
- * mappings, and device-dax namespaces, that try to guarantee a given
- * mapping size, will fail to enable
- */
+
if (vma_is_dax(vma))
return true;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b5807f23caf8..cccd1aab69dd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -37,7 +37,7 @@ struct hugepage_subpool {
struct hstate *hstate;
long min_hpages; /* Minimum huge pages or -1 if no minimum. */
long rsv_hpages; /* Pages reserved against global pool to */
- /* sasitfy minimum size. */
+ /* satisfy minimum size. */
};
struct resv_map {
@@ -139,7 +139,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
unsigned long dst_addr,
unsigned long src_addr,
struct page **pagep);
-int hugetlb_reserve_pages(struct inode *inode, long from, long to,
+bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
@@ -472,6 +472,84 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long flags);
#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
+/*
+ * huegtlb page specific state flags. These flags are located in page.private
+ * of the hugetlb head page. Functions created via the below macros should be
+ * used to manipulate these flags.
+ *
+ * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
+ * allocation time. Cleared when page is fully instantiated. Free
+ * routine checks flag to restore a reservation on error paths.
+ * Synchronization: Examined or modified by code that knows it has
+ * the only reference to page. i.e. After allocation but before use
+ * or when the page is being freed.
+ * HPG_migratable - Set after a newly allocated page is added to the page
+ * cache and/or page tables. Indicates the page is a candidate for
+ * migration.
+ * Synchronization: Initially set after new page allocation with no
+ * locking. When examined and modified during migration processing
+ * (isolate, migrate, putback) the hugetlb_lock is held.
+ * HPG_temporary - - Set on a page that is temporarily allocated from the buddy
+ * allocator. Typically used for migration target pages when no pages
+ * are available in the pool. The hugetlb free page path will
+ * immediately free pages with this flag set to the buddy allocator.
+ * Synchronization: Can be set after huge page allocation from buddy when
+ * code knows it has only reference. All other examinations and
+ * modifications require hugetlb_lock.
+ * HPG_freed - Set when page is on the free lists.
+ * Synchronization: hugetlb_lock held for examination and modification.
+ */
+enum hugetlb_page_flags {
+ HPG_restore_reserve = 0,
+ HPG_migratable,
+ HPG_temporary,
+ HPG_freed,
+ __NR_HPAGEFLAGS,
+};
+
+/*
+ * Macros to create test, set and clear function definitions for
+ * hugetlb specific page flags.
+ */
+#ifdef CONFIG_HUGETLB_PAGE
+#define TESTHPAGEFLAG(uname, flname) \
+static inline int HPage##uname(struct page *page) \
+ { return test_bit(HPG_##flname, &(page->private)); }
+
+#define SETHPAGEFLAG(uname, flname) \
+static inline void SetHPage##uname(struct page *page) \
+ { set_bit(HPG_##flname, &(page->private)); }
+
+#define CLEARHPAGEFLAG(uname, flname) \
+static inline void ClearHPage##uname(struct page *page) \
+ { clear_bit(HPG_##flname, &(page->private)); }
+#else
+#define TESTHPAGEFLAG(uname, flname) \
+static inline int HPage##uname(struct page *page) \
+ { return 0; }
+
+#define SETHPAGEFLAG(uname, flname) \
+static inline void SetHPage##uname(struct page *page) \
+ { }
+
+#define CLEARHPAGEFLAG(uname, flname) \
+static inline void ClearHPage##uname(struct page *page) \
+ { }
+#endif
+
+#define HPAGEFLAG(uname, flname) \
+ TESTHPAGEFLAG(uname, flname) \
+ SETHPAGEFLAG(uname, flname) \
+ CLEARHPAGEFLAG(uname, flname) \
+
+/*
+ * Create functions associated with hugetlb page flags
+ */
+HPAGEFLAG(RestoreReserve, restore_reserve)
+HPAGEFLAG(Migratable, migratable)
+HPAGEFLAG(Temporary, temporary)
+HPAGEFLAG(Freed, freed)
+
#ifdef CONFIG_HUGETLB_PAGE
#define HSTATE_NAME_LEN 32
@@ -531,6 +609,20 @@ extern unsigned int default_hstate_idx;
#define default_hstate (hstates[default_hstate_idx])
+/*
+ * hugetlb page subpool pointer located in hpage[1].private
+ */
+static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
+{
+ return (struct hugepage_subpool *)(hpage+1)->private;
+}
+
+static inline void hugetlb_set_page_subpool(struct page *hpage,
+ struct hugepage_subpool *subpool)
+{
+ set_page_private(hpage+1, (unsigned long)subpool);
+}
+
static inline struct hstate *hstate_file(struct file *f)
{
return hstate_inode(file_inode(f));
@@ -770,8 +862,6 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
}
#endif
-void set_page_huge_active(struct page *page);
-
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index ca5e89fb10d3..3d6d22a25bdc 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -5,6 +5,12 @@
#include <linux/types.h>
/*
+ * The annotations present in this file are only relevant for the software
+ * KASAN modes that rely on compiler instrumentation, and will be optimized
+ * away for the hardware tag-based KASAN mode. Use kasan_check_byte() instead.
+ */
+
+/*
* __kasan_check_*: Always available when KASAN is enabled. This may be used
* even in compilation units that selectively disable KASAN, but must use KASAN
* to validate access to an address. Never use these in header files!
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 0aea9e2a2a01..7eaf2d9effb4 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -185,19 +185,18 @@ static __always_inline void * __must_check kasan_init_slab_obj(
}
bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);
-static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object,
- unsigned long ip)
+static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object)
{
if (kasan_enabled())
- return __kasan_slab_free(s, object, ip);
+ return __kasan_slab_free(s, object, _RET_IP_);
return false;
}
void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
-static __always_inline void kasan_slab_free_mempool(void *ptr, unsigned long ip)
+static __always_inline void kasan_slab_free_mempool(void *ptr)
{
if (kasan_enabled())
- __kasan_slab_free_mempool(ptr, ip);
+ __kasan_slab_free_mempool(ptr, _RET_IP_);
}
void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
@@ -241,12 +240,25 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
}
void __kasan_kfree_large(void *ptr, unsigned long ip);
-static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip)
+static __always_inline void kasan_kfree_large(void *ptr)
{
if (kasan_enabled())
- __kasan_kfree_large(ptr, ip);
+ __kasan_kfree_large(ptr, _RET_IP_);
}
+/*
+ * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
+ * the hardware tag-based mode that doesn't rely on compiler instrumentation.
+ */
+bool __kasan_check_byte(const void *addr, unsigned long ip);
+static __always_inline bool kasan_check_byte(const void *addr)
+{
+ if (kasan_enabled())
+ return __kasan_check_byte(addr, _RET_IP_);
+ return true;
+}
+
+
bool kasan_save_enable_multi_shot(void);
void kasan_restore_multi_shot(bool enabled);
@@ -277,12 +289,11 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
{
return (void *)object;
}
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
- unsigned long ip)
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
{
return false;
}
-static inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) {}
+static inline void kasan_slab_free_mempool(void *ptr) {}
static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
gfp_t flags)
{
@@ -302,7 +313,11 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
{
return (void *)object;
}
-static inline void kasan_kfree_large(void *ptr, unsigned long ip) {}
+static inline void kasan_kfree_large(void *ptr) {}
+static inline bool kasan_check_byte(const void *address)
+{
+ return true;
+}
#endif /* CONFIG_KASAN */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index eeb0b52203e9..e6dc793d587d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -92,6 +92,10 @@ struct lruvec_stat {
long count[NR_VM_NODE_STAT_ITEMS];
};
+struct batched_lruvec_stat {
+ s32 count[NR_VM_NODE_STAT_ITEMS];
+};
+
/*
* Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
* which have elements charged to this memcg.
@@ -107,11 +111,17 @@ struct memcg_shrinker_map {
struct mem_cgroup_per_node {
struct lruvec lruvec;
- /* Legacy local VM stats */
+ /*
+ * Legacy local VM stats. This should be struct lruvec_stat and
+ * cannot be optimized to struct batched_lruvec_stat. Because
+ * the threshold of the lruvec_stat_cpu can be as big as
+ * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this
+ * filed has no upper limit.
+ */
struct lruvec_stat __percpu *lruvec_stat_local;
/* Subtree VM stats (batched updates) */
- struct lruvec_stat __percpu *lruvec_stat_cpu;
+ struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
@@ -475,19 +485,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
-/*
- * set_page_objcgs - associate a page with a object cgroups vector
- * @page: a pointer to the page struct
- * @objcgs: a pointer to the object cgroups vector
- *
- * Atomically associates a page with a vector of object cgroups.
- */
-static inline bool set_page_objcgs(struct page *page,
- struct obj_cgroup **objcgs)
-{
- return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs |
- MEMCG_DATA_OBJCGS);
-}
#else
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
@@ -498,12 +495,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
{
return NULL;
}
-
-static inline bool set_page_objcgs(struct page *page,
- struct obj_cgroup **objcgs)
-{
- return true;
-}
#endif
static __always_inline bool memcg_stat_item_in_bytes(int idx)
@@ -689,8 +680,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
-struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
-
struct lruvec *lock_page_lruvec(struct page *page);
struct lruvec *lock_page_lruvec_irq(struct page *page);
struct lruvec *lock_page_lruvec_irqsave(struct page *page,
@@ -1200,11 +1189,6 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
return NULL;
}
-static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
-{
- return NULL;
-}
-
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
@@ -1601,9 +1585,6 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
#endif
#ifdef CONFIG_MEMCG_KMEM
-int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages);
-void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4594838a0f7c..3a389633b68f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -89,7 +89,7 @@ extern int PageMovable(struct page *page);
extern void __SetPageMovable(struct page *page, struct address_space *mapping);
extern void __ClearPageMovable(struct page *page);
#else
-static inline int PageMovable(struct page *page) { return 0; };
+static inline int PageMovable(struct page *page) { return 0; }
static inline void __SetPageMovable(struct page *page,
struct address_space *mapping)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7deb7168104d..77e64e3eac80 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1187,6 +1187,9 @@ static inline void get_page(struct page *page)
}
bool __must_check try_grab_page(struct page *page, unsigned int flags);
+__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs,
+ unsigned int flags);
+
static inline __must_check bool try_get_page(struct page *page)
{
@@ -2310,32 +2313,20 @@ extern void free_initmem(void);
extern unsigned long free_reserved_area(void *start, void *end,
int poison, const char *s);
-#ifdef CONFIG_HIGHMEM
-/*
- * Free a highmem page into the buddy system, adjusting totalhigh_pages
- * and totalram_pages.
- */
-extern void free_highmem_page(struct page *page);
-#endif
-
extern void adjust_managed_page_count(struct page *page, long count);
extern void mem_init_print_info(const char *str);
extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
/* Free the reserved page into the buddy system, so it gets managed. */
-static inline void __free_reserved_page(struct page *page)
+static inline void free_reserved_page(struct page *page)
{
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
-}
-
-static inline void free_reserved_page(struct page *page)
-{
- __free_reserved_page(page);
adjust_managed_page_count(page, 1);
}
+#define free_highmem_page(page) free_reserved_page(page)
static inline void mark_page_reserved(struct page *page)
{
@@ -2405,9 +2396,10 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif
extern void set_dma_reserve(unsigned long new_dma_reserve);
-extern void memmap_init_zone(unsigned long, int, unsigned long,
+extern void memmap_init_range(unsigned long, int, unsigned long,
unsigned long, unsigned long, enum meminit_context,
struct vmem_altmap *, int migratetype);
+extern void memmap_init_zone(struct zone *zone);
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8fc71e9d7bb0..355ea1ee32bd 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -24,7 +24,7 @@ static inline int page_is_file_lru(struct page *page)
return !PageSwapBacked(page);
}
-static __always_inline void __update_lru_size(struct lruvec *lruvec,
+static __always_inline void update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
int nr_pages)
{
@@ -33,76 +33,27 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
-}
-
-static __always_inline void update_lru_size(struct lruvec *lruvec,
- enum lru_list lru, enum zone_type zid,
- int nr_pages)
-{
- __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}
-static __always_inline void add_page_to_lru_list(struct page *page,
- struct lruvec *lruvec, enum lru_list lru)
-{
- update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
- list_add(&page->lru, &lruvec->lists[lru]);
-}
-
-static __always_inline void add_page_to_lru_list_tail(struct page *page,
- struct lruvec *lruvec, enum lru_list lru)
-{
- update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
- list_add_tail(&page->lru, &lruvec->lists[lru]);
-}
-
-static __always_inline void del_page_from_lru_list(struct page *page,
- struct lruvec *lruvec, enum lru_list lru)
-{
- list_del(&page->lru);
- update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page));
-}
-
/**
- * page_lru_base_type - which LRU list type should a page be on?
- * @page: the page to test
- *
- * Used for LRU list index arithmetic.
- *
- * Returns the base LRU type - file or anon - @page should be on.
+ * __clear_page_lru_flags - clear page lru flags before releasing a page
+ * @page: the page that was on lru and now has a zero reference
*/
-static inline enum lru_list page_lru_base_type(struct page *page)
+static __always_inline void __clear_page_lru_flags(struct page *page)
{
- if (page_is_file_lru(page))
- return LRU_INACTIVE_FILE;
- return LRU_INACTIVE_ANON;
-}
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
-/**
- * page_off_lru - which LRU list was page on? clearing its lru flags.
- * @page: the page to test
- *
- * Returns the LRU list a page was on, as an index into the array of LRU
- * lists; and clears its Unevictable or Active flags, ready for freeing.
- */
-static __always_inline enum lru_list page_off_lru(struct page *page)
-{
- enum lru_list lru;
+ __ClearPageLRU(page);
- if (PageUnevictable(page)) {
- __ClearPageUnevictable(page);
- lru = LRU_UNEVICTABLE;
- } else {
- lru = page_lru_base_type(page);
- if (PageActive(page)) {
- __ClearPageActive(page);
- lru += LRU_ACTIVE;
- }
- }
- return lru;
+ /* this shouldn't happen, so leave the flags to bad_page() */
+ if (PageActive(page) && PageUnevictable(page))
+ return;
+
+ __ClearPageActive(page);
+ __ClearPageUnevictable(page);
}
/**
@@ -116,13 +67,41 @@ static __always_inline enum lru_list page_lru(struct page *page)
{
enum lru_list lru;
+ VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+
if (PageUnevictable(page))
- lru = LRU_UNEVICTABLE;
- else {
- lru = page_lru_base_type(page);
- if (PageActive(page))
- lru += LRU_ACTIVE;
- }
+ return LRU_UNEVICTABLE;
+
+ lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+ if (PageActive(page))
+ lru += LRU_ACTIVE;
+
return lru;
}
+
+static __always_inline void add_page_to_lru_list(struct page *page,
+ struct lruvec *lruvec)
+{
+ enum lru_list lru = page_lru(page);
+
+ update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+ list_add(&page->lru, &lruvec->lists[lru]);
+}
+
+static __always_inline void add_page_to_lru_list_tail(struct page *page,
+ struct lruvec *lruvec)
+{
+ enum lru_list lru = page_lru(page);
+
+ update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+ list_add_tail(&page->lru, &lruvec->lists[lru]);
+}
+
+static __always_inline void del_page_from_lru_list(struct page *page,
+ struct lruvec *lruvec)
+{
+ list_del(&page->lru);
+ update_lru_size(lruvec, page_lru(page), page_zonenum(page),
+ -thp_nr_pages(page));
+}
#endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316bff3d..9198b7ade85f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -206,10 +206,30 @@ enum node_stat_item {
NR_KERNEL_SCS_KB, /* measured in KiB */
#endif
NR_PAGETABLE, /* used for pagetables */
+#ifdef CONFIG_SWAP
+ NR_SWAPCACHE,
+#endif
NR_VM_NODE_STAT_ITEMS
};
/*
+ * Returns true if the item should be printed in THPs (/proc/vmstat
+ * currently prints number of anon, file and shmem THPs. But the item
+ * is charged in pages).
+ */
+static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return false;
+
+ return item == NR_ANON_THPS ||
+ item == NR_FILE_THPS ||
+ item == NR_SHMEM_THPS ||
+ item == NR_SHMEM_PMDMAPPED ||
+ item == NR_FILE_PMDMAPPED;
+}
+
+/*
* Returns true if the value is measured in bytes (most vmstat values are
* measured in pages). This defines the API part, the internal representation
* might be different.
@@ -872,8 +892,6 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
#endif
}
-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
-
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ec5d0290e0ee..db914477057b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -592,15 +592,9 @@ static inline void ClearPageCompound(struct page *page)
#ifdef CONFIG_HUGETLB_PAGE
int PageHuge(struct page *page);
int PageHeadHuge(struct page *page);
-bool page_huge_active(struct page *page);
#else
TESTPAGEFLAG_FALSE(Huge)
TESTPAGEFLAG_FALSE(HeadHuge)
-
-static inline bool page_huge_active(struct page *page)
-{
- return 0;
-}
#endif
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 85bd413e784e..679591301994 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -12,7 +12,6 @@ struct page_counter {
unsigned long low;
unsigned long high;
unsigned long max;
- struct page_counter *parent;
/* effective memory.min and memory.min usage tracking */
unsigned long emin;
@@ -27,6 +26,14 @@ struct page_counter {
/* legacy */
unsigned long watermark;
unsigned long failcnt;
+
+ /*
+ * 'parent' is placed here to be far from 'usage' to reduce
+ * cache false sharing, as 'usage' is written mostly while
+ * parent is frequently read for cgroup's hierarchical
+ * counting nature.
+ */
+ struct page_counter *parent;
};
#if BITS_PER_LONG == 32
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d5570deff400..bd629d676a27 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -681,8 +681,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
-extern void put_and_wait_on_page_locked(struct page *page);
-
+int put_and_wait_on_page_locked(struct page *page, int state);
void wait_on_page_writeback(struct page *page);
extern void end_page_writeback(struct page *page);
void wait_for_stable_page(struct page *page);
@@ -757,7 +756,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
extern void __delete_from_page_cache(struct page *page, void *shadow);
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
+void replace_page_cache_page(struct page *old, struct page *new);
void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3f1f7ae0fbe9..32f665b1ee85 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -356,7 +356,7 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page,
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
-extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
+extern bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
@@ -408,7 +408,11 @@ extern struct address_space *swapper_spaces[];
#define swap_address_space(entry) \
(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
>> SWAP_ADDRESS_SPACE_SHIFT])
-extern unsigned long total_swapcache_pages(void);
+static inline unsigned long total_swapcache_pages(void)
+{
+ return global_node_page_state(NR_SWAPCACHE);
+}
+
extern void show_swap_cache_info(void);
extern int add_to_swap(struct page *page);
extern void *get_shadow_from_swap_cache(swp_entry_t entry);
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index f65b1f6db22d..40845b0d5dad 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -115,7 +115,7 @@ DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node,
TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
);
-DECLARE_EVENT_CLASS(kmem_free,
+TRACE_EVENT(kfree,
TP_PROTO(unsigned long call_site, const void *ptr),
@@ -135,18 +135,26 @@ DECLARE_EVENT_CLASS(kmem_free,
(void *)__entry->call_site, __entry->ptr)
);
-DEFINE_EVENT(kmem_free, kfree,
+TRACE_EVENT(kmem_cache_free,
- TP_PROTO(unsigned long call_site, const void *ptr),
+ TP_PROTO(unsigned long call_site, const void *ptr, const char *name),
- TP_ARGS(call_site, ptr)
-);
+ TP_ARGS(call_site, ptr, name),
-DEFINE_EVENT(kmem_free, kmem_cache_free,
+ TP_STRUCT__entry(
+ __field( unsigned long, call_site )
+ __field( const void *, ptr )
+ __field( const char *, name )
+ ),
- TP_PROTO(unsigned long call_site, const void *ptr),
+ TP_fast_assign(
+ __entry->call_site = call_site;
+ __entry->ptr = ptr;
+ __entry->name = name;
+ ),
- TP_ARGS(call_site, ptr)
+ TP_printk("call_site=%pS ptr=%p name=%s",
+ (void *)__entry->call_site, __entry->ptr, __entry->name)
);
TRACE_EVENT(mm_page_free,
diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
index 8fd1babae761..e1735fe7c76a 100644
--- a/include/trace/events/pagemap.h
+++ b/include/trace/events/pagemap.h
@@ -27,24 +27,21 @@
TRACE_EVENT(mm_lru_insertion,
- TP_PROTO(
- struct page *page,
- int lru
- ),
+ TP_PROTO(struct page *page),
- TP_ARGS(page, lru),
+ TP_ARGS(page),
TP_STRUCT__entry(
__field(struct page *, page )
__field(unsigned long, pfn )
- __field(int, lru )
+ __field(enum lru_list, lru )
__field(unsigned long, flags )
),
TP_fast_assign(
__entry->page = page;
__entry->pfn = page_to_pfn(page);
- __entry->lru = lru;
+ __entry->lru = page_lru(page);
__entry->flags = trace_pagemap_flags(page);
),
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 3354774af61e..8948467b3992 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -28,12 +28,14 @@ enum {
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
+#define MPOL_F_NUMA_BALANCING (1 << 13) /* Optimize with NUMA balancing if possible */
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
-#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
+#define MPOL_MODE_FLAGS \
+ (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING)
/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
diff --git a/init/Kconfig b/init/Kconfig
index c2a18f56e944..0bf5b340b80e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1861,20 +1861,6 @@ config SLUB_DEBUG
SLUB sysfs support. /sys/slab will not exist and there will be
no support for cache validation etc.
-config SLUB_MEMCG_SYSFS_ON
- default n
- bool "Enable memcg SLUB sysfs support by default" if EXPERT
- depends on SLUB && SYSFS && MEMCG
- help
- SLUB creates a directory under /sys/kernel/slab for each
- allocation cache to host info and debug files. If memory
- cgroup is enabled, each cache can have per memory cgroup
- caches. SLUB can create the same sysfs directories for these
- caches under /sys/kernel/slab/CACHE/cgroup but it can lead
- to a very high number of debug files being created. This is
- controlled by slub_memcg_sysfs boot parameter and this
- config option determines the parameter's default value.
-
config COMPAT_BRK
bool "Disable heap randomization"
default y
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index f5fa4ba126bf..624ae1df7984 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -190,11 +190,11 @@ config KASAN_KUNIT_TEST
kernel debugging features like KASAN.
For more information on KUnit and unit tests in general, please refer
- to the KUnit documentation in Documentation/dev-tools/kunit
+ to the KUnit documentation in Documentation/dev-tools/kunit.
-config TEST_KASAN_MODULE
+config KASAN_MODULE_TEST
tristate "KUnit-incompatible tests of KASAN bug detection capabilities"
- depends on m && KASAN
+ depends on m && KASAN && !KASAN_HW_TAGS
help
This is a part of the KASAN test suite that is incompatible with
KUnit. Currently includes tests that do bad copy_from/to_user
diff --git a/lib/Makefile b/lib/Makefile
index fb7d946bb8c3..b5307d3eec1a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -66,7 +66,7 @@ obj-$(CONFIG_TEST_IDA) += test_ida.o
obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o
CFLAGS_test_kasan.o += -fno-builtin
CFLAGS_test_kasan.o += $(call cc-disable-warning, vla)
-obj-$(CONFIG_TEST_KASAN_MODULE) += test_kasan_module.o
+obj-$(CONFIG_KASAN_MODULE_TEST) += test_kasan_module.o
CFLAGS_test_kasan_module.o += -fno-builtin
obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o
CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla)
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 2947274cc2d3..25576303897b 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -13,6 +13,7 @@
#include <linux/mman.h>
#include <linux/module.h>
#include <linux/printk.h>
+#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -28,10 +29,9 @@
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
/*
- * We assign some test results to these globals to make sure the tests
- * are not eliminated as dead code.
+ * Some tests use these global variables to store return values from function
+ * calls that could otherwise be eliminated by the compiler as dead code.
*/
-
void *kasan_ptr_result;
int kasan_int_result;
@@ -39,40 +39,81 @@ static struct kunit_resource resource;
static struct kunit_kasan_expectation fail_data;
static bool multishot;
+/*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the
+ * first detected bug and panic the kernel if panic_on_warn is enabled. For
+ * hardware tag-based KASAN also allow tag checking to be reenabled for each
+ * test, see the comment for KUNIT_EXPECT_KASAN_FAIL().
+ */
static int kasan_test_init(struct kunit *test)
{
- /*
- * Temporarily enable multi-shot mode and set panic_on_warn=0.
- * Otherwise, we'd only get a report for the first case.
- */
- multishot = kasan_save_enable_multi_shot();
+ if (!kasan_enabled()) {
+ kunit_err(test, "can't run KASAN tests with KASAN disabled");
+ return -1;
+ }
+ multishot = kasan_save_enable_multi_shot();
+ kasan_set_tagging_report_once(false);
return 0;
}
static void kasan_test_exit(struct kunit *test)
{
+ kasan_set_tagging_report_once(true);
kasan_restore_multi_shot(multishot);
}
/**
- * KUNIT_EXPECT_KASAN_FAIL() - Causes a test failure when the expression does
- * not cause a KASAN error. This uses a KUnit resource named "kasan_data." Do
- * Do not use this name for a KUnit resource outside here.
+ * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a
+ * KASAN report; causes a test failure otherwise. This relies on a KUnit
+ * resource named "kasan_data". Do not use this name for KUnit resources
+ * outside of KASAN tests.
*
+ * For hardware tag-based KASAN, when a tag fault happens, tag checking is
+ * normally auto-disabled. When this happens, this test handler reenables
+ * tag checking. As tag checking can be only disabled or enabled per CPU, this
+ * handler disables migration (preemption).
+ *
+ * Since the compiler doesn't see that the expression can change the fail_data
+ * fields, it can reorder or optimize away the accesses to those fields.
+ * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the
+ * expression to prevent that.
*/
-#define KUNIT_EXPECT_KASAN_FAIL(test, condition) do { \
- fail_data.report_expected = true; \
- fail_data.report_found = false; \
- kunit_add_named_resource(test, \
- NULL, \
- NULL, \
- &resource, \
- "kasan_data", &fail_data); \
- condition; \
- KUNIT_EXPECT_EQ(test, \
- fail_data.report_expected, \
- fail_data.report_found); \
+#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) \
+ migrate_disable(); \
+ WRITE_ONCE(fail_data.report_expected, true); \
+ WRITE_ONCE(fail_data.report_found, false); \
+ kunit_add_named_resource(test, \
+ NULL, \
+ NULL, \
+ &resource, \
+ "kasan_data", &fail_data); \
+ barrier(); \
+ expression; \
+ barrier(); \
+ KUNIT_EXPECT_EQ(test, \
+ READ_ONCE(fail_data.report_expected), \
+ READ_ONCE(fail_data.report_found)); \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \
+ if (READ_ONCE(fail_data.report_found)) \
+ kasan_enable_tagging(); \
+ migrate_enable(); \
+ } \
+} while (0)
+
+#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
+ if (!IS_ENABLED(config)) { \
+ kunit_info((test), "skipping, " #config " required"); \
+ return; \
+ } \
+} while (0)
+
+#define KASAN_TEST_NEEDS_CONFIG_OFF(test, config) do { \
+ if (IS_ENABLED(config)) { \
+ kunit_info((test), "skipping, " #config " enabled"); \
+ return; \
+ } \
} while (0)
static void kmalloc_oob_right(struct kunit *test)
@@ -111,23 +152,24 @@ static void kmalloc_node_oob_right(struct kunit *test)
kfree(ptr);
}
+/*
+ * These kmalloc_pagealloc_* tests try allocating a memory chunk that doesn't
+ * fit into a slab cache and therefore is allocated via the page allocator
+ * fallback. Since this kind of fallback is only implemented for SLUB, these
+ * tests are limited to that allocator.
+ */
static void kmalloc_pagealloc_oob_right(struct kunit *test)
{
char *ptr;
size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
- if (!IS_ENABLED(CONFIG_SLUB)) {
- kunit_info(test, "CONFIG_SLUB is not enabled.");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
- /* Allocate a chunk that does not fit into a SLUB cache to trigger
- * the page allocator fallback.
- */
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0);
+
kfree(ptr);
}
@@ -136,15 +178,12 @@ static void kmalloc_pagealloc_uaf(struct kunit *test)
char *ptr;
size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
- if (!IS_ENABLED(CONFIG_SLUB)) {
- kunit_info(test, "CONFIG_SLUB is not enabled.");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
-
kfree(ptr);
+
KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0);
}
@@ -153,10 +192,7 @@ static void kmalloc_pagealloc_invalid_free(struct kunit *test)
char *ptr;
size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
- if (!IS_ENABLED(CONFIG_SLUB)) {
- kunit_info(test, "CONFIG_SLUB is not enabled.");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -164,11 +200,49 @@ static void kmalloc_pagealloc_invalid_free(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1));
}
+static void pagealloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ size_t order = 4;
+ size_t size = (1UL << (PAGE_SHIFT + order));
+
+ /*
+ * With generic KASAN page allocations have no redzones, thus
+ * out-of-bounds detection is not guaranteed.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=210503.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
+ free_pages((unsigned long)ptr, order);
+}
+
+static void pagealloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ size_t order = 4;
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ free_pages((unsigned long)ptr, order);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0);
+}
+
static void kmalloc_large_oob_right(struct kunit *test)
{
char *ptr;
size_t size = KMALLOC_MAX_CACHE_SIZE - 256;
- /* Allocate a chunk that is large enough, but still fits into a slab
+
+ /*
+ * Allocate a chunk that is large enough, but still fits into a slab
* and does not trigger the page allocator fallback in SLUB.
*/
ptr = kmalloc(size, GFP_KERNEL);
@@ -217,10 +291,7 @@ static void kmalloc_oob_16(struct kunit *test)
} *ptr1, *ptr2;
/* This test is specifically crafted for the generic mode. */
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
- kunit_info(test, "CONFIG_KASAN_GENERIC required\n");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
@@ -355,7 +426,9 @@ static void kmalloc_uaf2(struct kunit *test)
{
char *ptr1, *ptr2;
size_t size = 43;
+ int counter = 0;
+again:
ptr1 = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
@@ -364,6 +437,15 @@ static void kmalloc_uaf2(struct kunit *test)
ptr2 = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ /*
+ * For tag-based KASAN ptr1 and ptr2 tags might happen to be the same.
+ * Allow up to 16 attempts at generating different tags.
+ */
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && ptr1 == ptr2 && counter++ < 16) {
+ kfree(ptr2);
+ goto again;
+ }
+
KUNIT_EXPECT_KASAN_FAIL(test, ptr1[40] = 'x');
KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
@@ -402,10 +484,11 @@ static void kmem_cache_oob(struct kunit *test)
{
char *p;
size_t size = 200;
- struct kmem_cache *cache = kmem_cache_create("test_cache",
- size, 0,
- 0, NULL);
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
p = kmem_cache_alloc(cache, GFP_KERNEL);
if (!p) {
kunit_err(test, "Allocation failed: %s\n", __func__);
@@ -414,11 +497,12 @@ static void kmem_cache_oob(struct kunit *test)
}
KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]);
+
kmem_cache_free(cache, p);
kmem_cache_destroy(cache);
}
-static void memcg_accounted_kmem_cache(struct kunit *test)
+static void kmem_cache_accounted(struct kunit *test)
{
int i;
char *p;
@@ -445,6 +529,31 @@ free_cache:
kmem_cache_destroy(cache);
}
+static void kmem_cache_bulk(struct kunit *test)
+{
+ struct kmem_cache *cache;
+ size_t size = 200;
+ char *p[10];
+ bool ret;
+ int i;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p);
+ if (!ret) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(p); i++)
+ p[i][0] = p[i][size - 1] = 42;
+
+ kmem_cache_free_bulk(cache, ARRAY_SIZE(p), (void **)&p);
+ kmem_cache_destroy(cache);
+}
+
static char global_array[10];
static void kasan_global_oob(struct kunit *test)
@@ -453,14 +562,12 @@ static void kasan_global_oob(struct kunit *test)
char *p = &global_array[ARRAY_SIZE(global_array) + i];
/* Only generic mode instruments globals. */
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
- kunit_info(test, "CONFIG_KASAN_GENERIC required");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
}
+/* Check that ksize() makes the whole object accessible. */
static void ksize_unpoisons_memory(struct kunit *test)
{
char *ptr;
@@ -469,11 +576,32 @@ static void ksize_unpoisons_memory(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
real_size = ksize(ptr);
- /* This access doesn't trigger an error. */
+
+ /* This access shouldn't trigger a KASAN report. */
ptr[size] = 'x';
- /* This one does. */
+
+ /* This one must. */
KUNIT_EXPECT_KASAN_FAIL(test, ptr[real_size] = 'y');
+
+ kfree(ptr);
+}
+
+/*
+ * Check that a use-after-free is detected by ksize() and via normal accesses
+ * after it.
+ */
+static void ksize_uaf(struct kunit *test)
+{
+ char *ptr;
+ int size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size));
}
static void kasan_stack_oob(struct kunit *test)
@@ -482,10 +610,7 @@ static void kasan_stack_oob(struct kunit *test)
volatile int i = OOB_TAG_OFF;
char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
- if (!IS_ENABLED(CONFIG_KASAN_STACK)) {
- kunit_info(test, "CONFIG_KASAN_STACK is not enabled");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
}
@@ -497,15 +622,8 @@ static void kasan_alloca_oob_left(struct kunit *test)
char *p = alloca_array - 1;
/* Only generic mode instruments dynamic allocas. */
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
- kunit_info(test, "CONFIG_KASAN_GENERIC required");
- return;
- }
-
- if (!IS_ENABLED(CONFIG_KASAN_STACK)) {
- kunit_info(test, "CONFIG_KASAN_STACK is not enabled");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
}
@@ -517,15 +635,8 @@ static void kasan_alloca_oob_right(struct kunit *test)
char *p = alloca_array + i;
/* Only generic mode instruments dynamic allocas. */
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
- kunit_info(test, "CONFIG_KASAN_GENERIC required");
- return;
- }
-
- if (!IS_ENABLED(CONFIG_KASAN_STACK)) {
- kunit_info(test, "CONFIG_KASAN_STACK is not enabled");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
}
@@ -568,7 +679,7 @@ static void kmem_cache_invalid_free(struct kunit *test)
return;
}
- /* Trigger invalid free, the object doesn't get freed */
+ /* Trigger invalid free, the object doesn't get freed. */
KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1));
/*
@@ -585,12 +696,11 @@ static void kasan_memchr(struct kunit *test)
char *ptr;
size_t size = 24;
- /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- kunit_info(test,
- "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT");
- return;
- }
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
if (OOB_TAG_OFF)
size = round_up(size, OOB_TAG_OFF);
@@ -610,12 +720,11 @@ static void kasan_memcmp(struct kunit *test)
size_t size = 24;
int arr[9];
- /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- kunit_info(test,
- "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT");
- return;
- }
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
if (OOB_TAG_OFF)
size = round_up(size, OOB_TAG_OFF);
@@ -634,12 +743,11 @@ static void kasan_strings(struct kunit *test)
char *ptr;
size_t size = 24;
- /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- kunit_info(test,
- "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT");
- return;
- }
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -700,13 +808,10 @@ static void kasan_bitops_generic(struct kunit *test)
long *bits;
/* This test is specifically crafted for the generic mode. */
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
- kunit_info(test, "CONFIG_KASAN_GENERIC required\n");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
/*
- * Allocate 1 more byte, which causes kzalloc to round up to 16-bytes;
+ * Allocate 1 more byte, which causes kzalloc to round up to 16 bytes;
* this way we do not actually corrupt other memory.
*/
bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL);
@@ -731,19 +836,16 @@ static void kasan_bitops_tags(struct kunit *test)
{
long *bits;
- /* This test is specifically crafted for the tag-based mode. */
- if (IS_ENABLED(CONFIG_KASAN_GENERIC)) {
- kunit_info(test, "CONFIG_KASAN_SW_TAGS required\n");
- return;
- }
+ /* This test is specifically crafted for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
- /* Allocation size will be rounded to up granule size, which is 16. */
- bits = kzalloc(sizeof(*bits), GFP_KERNEL);
+ /* kmalloc-64 cache will be used and the last 16 bytes will be the redzone. */
+ bits = kzalloc(48, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits);
- /* Do the accesses past the 16 allocated bytes. */
- kasan_bitops_modify(test, BITS_PER_LONG, &bits[1]);
- kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, &bits[1]);
+ /* Do the accesses past the 48 allocated bytes, but within the redone. */
+ kasan_bitops_modify(test, BITS_PER_LONG, (void *)bits + 48);
+ kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, (void *)bits + 48);
kfree(bits);
}
@@ -764,10 +866,7 @@ static void vmalloc_oob(struct kunit *test)
{
void *area;
- if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
- kunit_info(test, "CONFIG_KASAN_VMALLOC is not enabled.");
- return;
- }
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
/*
* We have to be careful not to hit the guard page.
@@ -780,6 +879,94 @@ static void vmalloc_oob(struct kunit *test)
vfree(area);
}
+/*
+ * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN,
+ * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based
+ * modes.
+ */
+static void match_all_not_assigned(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ int i, size, order;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ for (i = 0; i < 256; i++) {
+ size = (get_random_int() % 1024) + 1;
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ kfree(ptr);
+ }
+
+ for (i = 0; i < 256; i++) {
+ order = (get_random_int() % 4) + 1;
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ free_pages((unsigned long)ptr, order);
+ }
+}
+
+/* Check that 0xff works as a match-all pointer tag for tag-based modes. */
+static void match_all_ptr_tag(struct kunit *test)
+{
+ char *ptr;
+ u8 tag;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(128, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* Backup the assigned tag. */
+ tag = get_tag(ptr);
+ KUNIT_EXPECT_NE(test, tag, (u8)KASAN_TAG_KERNEL);
+
+ /* Reset the tag to 0xff.*/
+ ptr = set_tag(ptr, KASAN_TAG_KERNEL);
+
+ /* This access shouldn't trigger a KASAN report. */
+ *ptr = 0;
+
+ /* Recover the pointer tag and free. */
+ ptr = set_tag(ptr, tag);
+ kfree(ptr);
+}
+
+/* Check that there are no match-all memory tags for tag-based modes. */
+static void match_all_mem_tag(struct kunit *test)
+{
+ char *ptr;
+ int tag;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(128, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_NE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* For each possible tag value not matching the pointer tag. */
+ for (tag = KASAN_TAG_MIN; tag <= KASAN_TAG_KERNEL; tag++) {
+ if (tag == get_tag(ptr))
+ continue;
+
+ /* Mark the first memory granule with the chosen memory tag. */
+ kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag);
+
+ /* This access must cause a KASAN report. */
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = 0);
+ }
+
+ /* Recover the memory tag and free. */
+ kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr));
+ kfree(ptr);
+}
+
static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmalloc_oob_right),
KUNIT_CASE(kmalloc_oob_left),
@@ -787,6 +974,8 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmalloc_pagealloc_oob_right),
KUNIT_CASE(kmalloc_pagealloc_uaf),
KUNIT_CASE(kmalloc_pagealloc_invalid_free),
+ KUNIT_CASE(pagealloc_oob_right),
+ KUNIT_CASE(pagealloc_uaf),
KUNIT_CASE(kmalloc_large_oob_right),
KUNIT_CASE(kmalloc_oob_krealloc_more),
KUNIT_CASE(kmalloc_oob_krealloc_less),
@@ -804,12 +993,14 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kfree_via_page),
KUNIT_CASE(kfree_via_phys),
KUNIT_CASE(kmem_cache_oob),
- KUNIT_CASE(memcg_accounted_kmem_cache),
+ KUNIT_CASE(kmem_cache_accounted),
+ KUNIT_CASE(kmem_cache_bulk),
KUNIT_CASE(kasan_global_oob),
KUNIT_CASE(kasan_stack_oob),
KUNIT_CASE(kasan_alloca_oob_left),
KUNIT_CASE(kasan_alloca_oob_right),
KUNIT_CASE(ksize_unpoisons_memory),
+ KUNIT_CASE(ksize_uaf),
KUNIT_CASE(kmem_cache_double_free),
KUNIT_CASE(kmem_cache_invalid_free),
KUNIT_CASE(kasan_memchr),
@@ -819,6 +1010,9 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_bitops_tags),
KUNIT_CASE(kmalloc_double_kzfree),
KUNIT_CASE(vmalloc_oob),
+ KUNIT_CASE(match_all_not_assigned),
+ KUNIT_CASE(match_all_ptr_tag),
+ KUNIT_CASE(match_all_mem_tag),
{}
};
diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c
index 3b4cc77992d2..eee017ff8980 100644
--- a/lib/test_kasan_module.c
+++ b/lib/test_kasan_module.c
@@ -123,8 +123,9 @@ static noinline void __init kasan_workqueue_uaf(void)
static int __init test_kasan_module_init(void)
{
/*
- * Temporarily enable multi-shot mode. Otherwise, we'd only get a
- * report for the first case.
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
*/
bool multishot = kasan_save_enable_multi_shot();
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e33797579338..eca555f658d9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -32,6 +32,8 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -69,7 +71,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
global_dirty_limits(&background_thresh, &dirty_thresh);
wb_thresh = wb_calc_thresh(wb, dirty_thresh);
-#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
"BdiWriteback: %10lu kB\n"
"BdiReclaimable: %10lu kB\n"
@@ -98,7 +99,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_more_io,
nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->wb.state);
-#undef K
return 0;
}
@@ -146,8 +146,6 @@ static ssize_t read_ahead_kb_store(struct device *dev,
return count;
}
-#define K(pages) ((pages) << (PAGE_SHIFT - 10))
-
#define BDI_SHOW(name, expr) \
static ssize_t name##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
diff --git a/mm/compaction.c b/mm/compaction.c
index 190ccdaa6c19..e04f4476e68e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -137,7 +137,6 @@ EXPORT_SYMBOL(__SetPageMovable);
void __ClearPageMovable(struct page *page)
{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageMovable(page), page);
/*
* Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
@@ -988,14 +987,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(!get_page_unless_zero(page)))
goto isolate_fail;
- if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
+ if (!__isolate_lru_page_prepare(page, isolate_mode))
goto isolate_fail_put;
/* Try isolate the page */
if (!TestClearPageLRU(page))
goto isolate_fail_put;
- rcu_read_lock();
lruvec = mem_cgroup_page_lruvec(page, pgdat);
/* If we already hold the lock, we can skip some rechecking */
@@ -1005,7 +1003,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
locked = lruvec;
- rcu_read_unlock();
lruvec_memcg_debug(lruvec, page);
@@ -1026,15 +1023,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
SetPageLRU(page);
goto isolate_fail_put;
}
- } else
- rcu_read_unlock();
+ }
/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
low_pfn += compound_nr(page) - 1;
/* Successfully isolated */
- del_page_from_lru_list(page, lruvec, page_lru(page));
+ del_page_from_lru_list(page, lruvec);
mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_is_file_lru(page),
thp_nr_pages(page));
@@ -1288,7 +1284,7 @@ static void
fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
{
unsigned long start_pfn, end_pfn;
- struct page *page = pfn_to_page(pfn);
+ struct page *page;
/* Do not search around if there are enough pages already */
if (cc->nr_freepages >= cc->nr_migratepages)
@@ -1299,8 +1295,12 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
return;
/* Pageblock boundaries */
- start_pfn = pageblock_start_pfn(pfn);
- end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
+ start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
+
+ page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
+ if (!page)
+ return;
/* Scan before */
if (start_pfn != pfn) {
@@ -1402,7 +1402,8 @@ fast_isolate_freepages(struct compact_control *cc)
pfn = page_to_pfn(freepage);
if (pfn >= highest)
- highest = pageblock_start_pfn(pfn);
+ highest = max(pageblock_start_pfn(pfn),
+ cc->zone->zone_start_pfn);
if (pfn >= low_pfn) {
cc->fast_search_fail = 0;
@@ -1472,7 +1473,8 @@ fast_isolate_freepages(struct compact_control *cc)
} else {
if (cc->direct_compaction && pfn_valid(min_pfn)) {
page = pageblock_pfn_to_page(min_pfn,
- pageblock_end_pfn(min_pfn),
+ min(pageblock_end_pfn(min_pfn),
+ zone_end_pfn(cc->zone)),
cc->zone);
cc->free_pfn = min_pfn;
}
@@ -1702,6 +1704,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
unsigned long pfn = cc->migrate_pfn;
unsigned long high_pfn;
int order;
+ bool found_block = false;
/* Skip hints are relied on to avoid repeats on the fast search */
if (cc->ignore_skip_hint)
@@ -1744,7 +1747,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
for (order = cc->order - 1;
- order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
order--) {
struct free_area *area = &cc->zone->free_area[order];
struct list_head *freelist;
@@ -1759,7 +1762,11 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
list_for_each_entry(freepage, freelist, lru) {
unsigned long free_pfn;
- nr_scanned++;
+ if (nr_scanned++ >= limit) {
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+
free_pfn = page_to_pfn(freepage);
if (free_pfn < high_pfn) {
/*
@@ -1768,12 +1775,8 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* the list assumes an entry is deleted, not
* reordered.
*/
- if (get_pageblock_skip(freepage)) {
- if (list_is_last(freelist, &freepage->lru))
- break;
-
+ if (get_pageblock_skip(freepage))
continue;
- }
/* Reorder to so a future search skips recent pages */
move_freelist_tail(freelist, freepage);
@@ -1781,15 +1784,10 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
update_fast_start_pfn(cc, free_pfn);
pfn = pageblock_start_pfn(free_pfn);
cc->fast_search_fail = 0;
+ found_block = true;
set_pageblock_skip(freepage);
break;
}
-
- if (nr_scanned >= limit) {
- cc->fast_search_fail++;
- move_freelist_tail(freelist, freepage);
- break;
- }
}
spin_unlock_irqrestore(&cc->zone->lock, flags);
}
@@ -1800,9 +1798,10 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* If fast scanning failed then use a cached entry for a page block
* that had free pages as the basis for starting a linear scan.
*/
- if (pfn == cc->migrate_pfn)
+ if (!found_block) {
+ cc->fast_search_fail++;
pfn = reinit_migrate_pfn(cc);
-
+ }
return pfn;
}
@@ -1926,20 +1925,28 @@ static bool kswapd_is_running(pg_data_t *pgdat)
/*
* A zone's fragmentation score is the external fragmentation wrt to the
- * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value
- * in the range [0, 100].
+ * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+ return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+}
+
+/*
+ * A weighted zone's fragmentation score is the external fragmentation
+ * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
+ * returns a value in the range [0, 100].
*
* The scaling factor ensures that proactive compaction focuses on larger
* zones like ZONE_NORMAL, rather than smaller, specialized zones like
* ZONE_DMA32. For smaller zones, the score value remains close to zero,
* and thus never exceeds the high threshold for proactive compaction.
*/
-static unsigned int fragmentation_score_zone(struct zone *zone)
+static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
{
unsigned long score;
- score = zone->present_pages *
- extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+ score = zone->present_pages * fragmentation_score_zone(zone);
return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
}
@@ -1959,7 +1966,7 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
struct zone *zone;
zone = &pgdat->node_zones[zoneid];
- score += fragmentation_score_zone(zone);
+ score += fragmentation_score_zone_weighted(zone);
}
return score;
diff --git a/mm/debug.c b/mm/debug.c
index 8a40b3fefbeb..0bdda8407f71 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -110,6 +110,11 @@ void __dump_page(struct page *page, const char *reason)
head_compound_mapcount(head));
}
}
+
+#ifdef CONFIG_MEMCG
+ if (head->memcg_data)
+ pr_warn("memcg:%lx\n", head->memcg_data);
+#endif
if (PageKsm(page))
type = "ksm ";
else if (PageAnon(page))
@@ -180,11 +185,6 @@ hex_only:
if (reason)
pr_warn("page dumped because: %s\n", reason);
-
-#ifdef CONFIG_MEMCG
- if (!page_poisoned && page->memcg_data)
- pr_warn("pages's memcg:%lx\n", page->memcg_data);
-#endif
}
void dump_page(struct page *page, const char *reason)
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c05d9dcf7891..a9bd6ce1ba02 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -58,11 +58,23 @@
#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
-static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_basic_tests(unsigned long pfn, int idx)
{
+ pgprot_t prot = protection_map[idx];
pte_t pte = pfn_pte(pfn, prot);
+ unsigned long val = idx, *ptr = &val;
+
+ pr_debug("Validating PTE basic (%pGv)\n", ptr);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pte() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pte_dirty(pte_wrprotect(pte)));
- pr_debug("Validating PTE basic\n");
WARN_ON(!pte_same(pte, pte));
WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
@@ -70,6 +82,8 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
+ WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte))));
+ WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
}
static void __init pte_advanced_tests(struct mm_struct *mm,
@@ -129,14 +143,27 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_basic_tests(unsigned long pfn, int idx)
{
+ pgprot_t prot = protection_map[idx];
pmd_t pmd = pfn_pmd(pfn, prot);
+ unsigned long val = idx, *ptr = &val;
if (!has_transparent_hugepage())
return;
- pr_debug("Validating PMD basic\n");
+ pr_debug("Validating PMD basic (%pGv)\n", ptr);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pmd() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pmd_dirty(pmd_wrprotect(pmd)));
+
+
WARN_ON(!pmd_same(pmd, pmd));
WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
@@ -144,6 +171,8 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd))));
WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd))));
WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd))));
+ WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd))));
+ WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd))));
/*
* A huge page does not point to next level page table
* entry. Hence this must qualify as pmd_bad().
@@ -249,19 +278,35 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx)
{
+ pgprot_t prot = protection_map[idx];
pud_t pud = pfn_pud(pfn, prot);
+ unsigned long val = idx, *ptr = &val;
if (!has_transparent_hugepage())
return;
- pr_debug("Validating PUD basic\n");
+ pr_debug("Validating PUD basic (%pGv)\n", ptr);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pud() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pud_dirty(pud_wrprotect(pud)));
+
WARN_ON(!pud_same(pud, pud));
WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
+ WARN_ON(!pud_dirty(pud_mkdirty(pud_mkclean(pud))));
+ WARN_ON(pud_dirty(pud_mkclean(pud_mkdirty(pud))));
WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud))));
WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud))));
+ WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud))));
+ WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud))));
if (mm_pmd_folded(mm))
return;
@@ -359,7 +404,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pud_t *pudp,
unsigned long pfn, unsigned long vaddr,
@@ -372,8 +417,8 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
@@ -899,6 +944,7 @@ static int __init debug_vm_pgtable(void)
unsigned long vaddr, pte_aligned, pmd_aligned;
unsigned long pud_aligned, p4d_aligned, pgd_aligned;
spinlock_t *ptl = NULL;
+ int idx;
pr_info("Validating architecture page table helpers\n");
prot = vm_get_page_prot(VMFLAGS);
@@ -963,9 +1009,25 @@ static int __init debug_vm_pgtable(void)
saved_pmdp = pmd_offset(pudp, 0UL);
saved_ptep = pmd_pgtable(pmd);
- pte_basic_tests(pte_aligned, prot);
- pmd_basic_tests(pmd_aligned, prot);
- pud_basic_tests(pud_aligned, prot);
+ /*
+ * Iterate over the protection_map[] to make sure that all
+ * the basic page table transformation validations just hold
+ * true irrespective of the starting protection value for a
+ * given page table entry.
+ */
+ for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) {
+ pte_basic_tests(pte_aligned, idx);
+ pmd_basic_tests(pmd_aligned, idx);
+ pud_basic_tests(mm, pud_aligned, idx);
+ }
+
+ /*
+ * Both P4D and PGD level tests are very basic which do not
+ * involve creating page table entries from the protection
+ * value and the given pfn. Hence just keep them out from
+ * the above iteration for now to save some test execution
+ * time.
+ */
p4d_basic_tests(p4d_aligned, prot);
pgd_basic_tests(pgd_aligned, prot);
diff --git a/mm/filemap.c b/mm/filemap.c
index 6ff2a3fb0dc7..46a8b9e82434 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -206,9 +206,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
if (PageSwapBacked(page)) {
__mod_lruvec_page_state(page, NR_SHMEM, -nr);
if (PageTransHuge(page))
- __dec_lruvec_page_state(page, NR_SHMEM_THPS);
+ __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr);
} else if (PageTransHuge(page)) {
- __dec_lruvec_page_state(page, NR_FILE_THPS);
+ __mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
@@ -777,7 +777,6 @@ EXPORT_SYMBOL(file_write_and_wait_range);
* replace_page_cache_page - replace a pagecache page with a new one
* @old: page to be replaced
* @new: page to replace with
- * @gfp_mask: allocation mode
*
* This function replaces a page in the pagecache with a new one. On
* success it acquires the pagecache reference for the new page and
@@ -786,10 +785,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
* caller must do that.
*
* The remove + add is atomic. This function cannot fail.
- *
- * Return: %0
*/
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+void replace_page_cache_page(struct page *old, struct page *new)
{
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *) = mapping->a_ops->freepage;
@@ -824,8 +821,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (freepage)
freepage(old);
put_page(old);
-
- return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
@@ -1348,61 +1343,26 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
-static int __wait_on_page_locked_async(struct page *page,
- struct wait_page_queue *wait, bool set)
-{
- struct wait_queue_head *q = page_waitqueue(page);
- int ret = 0;
-
- wait->page = page;
- wait->bit_nr = PG_locked;
-
- spin_lock_irq(&q->lock);
- __add_wait_queue_entry_tail(q, &wait->wait);
- SetPageWaiters(page);
- if (set)
- ret = !trylock_page(page);
- else
- ret = PageLocked(page);
- /*
- * If we were successful now, we know we're still on the
- * waitqueue as we're still under the lock. This means it's
- * safe to remove and return success, we know the callback
- * isn't going to trigger.
- */
- if (!ret)
- __remove_wait_queue(q, &wait->wait);
- else
- ret = -EIOCBQUEUED;
- spin_unlock_irq(&q->lock);
- return ret;
-}
-
-static int wait_on_page_locked_async(struct page *page,
- struct wait_page_queue *wait)
-{
- if (!PageLocked(page))
- return 0;
- return __wait_on_page_locked_async(compound_head(page), wait, false);
-}
-
/**
* put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
* @page: The page to wait for.
+ * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
*
* The caller should hold a reference on @page. They expect the page to
* become unlocked relatively soon, but do not wish to hold up migration
* (for example) by holding the reference while waiting for the page to
* come unlocked. After this function returns, the caller should not
* dereference @page.
+ *
+ * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
*/
-void put_and_wait_on_page_locked(struct page *page)
+int put_and_wait_on_page_locked(struct page *page, int state)
{
wait_queue_head_t *q;
page = compound_head(page);
q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+ return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
}
/**
@@ -1558,7 +1518,28 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
int __lock_page_async(struct page *page, struct wait_page_queue *wait)
{
- return __wait_on_page_locked_async(page, wait, true);
+ struct wait_queue_head *q = page_waitqueue(page);
+ int ret = 0;
+
+ wait->page = page;
+ wait->bit_nr = PG_locked;
+
+ spin_lock_irq(&q->lock);
+ __add_wait_queue_entry_tail(q, &wait->wait);
+ SetPageWaiters(page);
+ ret = !trylock_page(page);
+ /*
+ * If we were successful now, we know we're still on the
+ * waitqueue as we're still under the lock. This means it's
+ * safe to remove and return success, we know the callback
+ * isn't going to trigger.
+ */
+ if (!ret)
+ __remove_wait_queue(q, &wait->wait);
+ else
+ ret = -EIOCBQUEUED;
+ spin_unlock_irq(&q->lock);
+ return ret;
}
/*
@@ -2173,287 +2154,267 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
ra->ra_pages /= 4;
}
-static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
+/*
+ * filemap_get_read_batch - Get a batch of pages for read
+ *
+ * Get a batch of pages which represent a contiguous range of bytes
+ * in the file. No tail pages will be returned. If @index is in the
+ * middle of a THP, the entire THP will be returned. The last page in
+ * the batch may have Readahead set or be not Uptodate so that the
+ * caller can take the appropriate action.
+ */
+static void filemap_get_read_batch(struct address_space *mapping,
+ pgoff_t index, pgoff_t max, struct pagevec *pvec)
{
- if (iocb->ki_flags & IOCB_WAITQ)
- return lock_page_async(page, iocb->ki_waitq);
- else if (iocb->ki_flags & IOCB_NOWAIT)
- return trylock_page(page) ? 0 : -EAGAIN;
- else
- return lock_page_killable(page);
+ XA_STATE(xas, &mapping->i_pages, index);
+ struct page *head;
+
+ rcu_read_lock();
+ for (head = xas_load(&xas); head; head = xas_next(&xas)) {
+ if (xas_retry(&xas, head))
+ continue;
+ if (xas.xa_index > max || xa_is_value(head))
+ break;
+ if (!page_cache_get_speculative(head))
+ goto retry;
+
+ /* Has the page moved or been split? */
+ if (unlikely(head != xas_reload(&xas)))
+ goto put_page;
+
+ if (!pagevec_add(pvec, head))
+ break;
+ if (!PageUptodate(head))
+ break;
+ if (PageReadahead(head))
+ break;
+ xas.xa_index = head->index + thp_nr_pages(head) - 1;
+ xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
+ continue;
+put_page:
+ put_page(head);
+retry:
+ xas_reset(&xas);
+ }
+ rcu_read_unlock();
}
-static struct page *
-generic_file_buffered_read_readpage(struct kiocb *iocb,
- struct file *filp,
- struct address_space *mapping,
- struct page *page)
+static int filemap_read_page(struct file *file, struct address_space *mapping,
+ struct page *page)
{
- struct file_ra_state *ra = &filp->f_ra;
int error;
- if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EAGAIN);
- }
-
/*
- * A previous I/O error may have been due to temporary
- * failures, eg. multipath errors.
- * PG_error will be set again if readpage fails.
+ * A previous I/O error may have been due to temporary failures,
+ * eg. multipath errors. PG_error will be set again if readpage
+ * fails.
*/
ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
- error = mapping->a_ops->readpage(filp, page);
+ error = mapping->a_ops->readpage(file, page);
+ if (error)
+ return error;
- if (unlikely(error)) {
- put_page(page);
- return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
- }
+ error = wait_on_page_locked_killable(page);
+ if (error)
+ return error;
+ if (PageUptodate(page))
+ return 0;
+ if (!page->mapping) /* page truncated */
+ return AOP_TRUNCATED_PAGE;
+ shrink_readahead_size_eio(&file->f_ra);
+ return -EIO;
+}
- if (!PageUptodate(page)) {
- error = lock_page_for_iocb(iocb, page);
- if (unlikely(error)) {
- put_page(page);
- return ERR_PTR(error);
- }
- if (!PageUptodate(page)) {
- if (page->mapping == NULL) {
- /*
- * invalidate_mapping_pages got it
- */
- unlock_page(page);
- put_page(page);
- return NULL;
- }
- unlock_page(page);
- shrink_readahead_size_eio(ra);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- unlock_page(page);
+static bool filemap_range_uptodate(struct address_space *mapping,
+ loff_t pos, struct iov_iter *iter, struct page *page)
+{
+ int count;
+
+ if (PageUptodate(page))
+ return true;
+ /* pipes can't handle partially uptodate pages */
+ if (iov_iter_is_pipe(iter))
+ return false;
+ if (!mapping->a_ops->is_partially_uptodate)
+ return false;
+ if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page)))
+ return false;
+
+ count = iter->count;
+ if (page_offset(page) > pos) {
+ count -= page_offset(page) - pos;
+ pos = 0;
+ } else {
+ pos -= page_offset(page);
}
- return page;
+ return mapping->a_ops->is_partially_uptodate(page, pos, count);
}
-static struct page *
-generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
- struct file *filp,
- struct iov_iter *iter,
- struct page *page,
- loff_t pos, loff_t count)
+static int filemap_update_page(struct kiocb *iocb,
+ struct address_space *mapping, struct iov_iter *iter,
+ struct page *page)
{
- struct address_space *mapping = filp->f_mapping;
- struct inode *inode = mapping->host;
int error;
- /*
- * See comment in do_read_cache_page on why
- * wait_on_page_locked is used to avoid unnecessarily
- * serialisations and why it's safe.
- */
- if (iocb->ki_flags & IOCB_WAITQ) {
- error = wait_on_page_locked_async(page,
- iocb->ki_waitq);
- } else {
- error = wait_on_page_locked_killable(page);
- }
- if (unlikely(error)) {
- put_page(page);
- return ERR_PTR(error);
+ if (!trylock_page(page)) {
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+ return -EAGAIN;
+ if (!(iocb->ki_flags & IOCB_WAITQ)) {
+ put_and_wait_on_page_locked(page, TASK_KILLABLE);
+ return AOP_TRUNCATED_PAGE;
+ }
+ error = __lock_page_async(page, iocb->ki_waitq);
+ if (error)
+ return error;
}
- if (PageUptodate(page))
- return page;
- if (inode->i_blkbits == PAGE_SHIFT ||
- !mapping->a_ops->is_partially_uptodate)
- goto page_not_up_to_date;
- /* pipes can't handle partially uptodate pages */
- if (unlikely(iov_iter_is_pipe(iter)))
- goto page_not_up_to_date;
- if (!trylock_page(page))
- goto page_not_up_to_date;
- /* Did it get truncated before we got the lock? */
if (!page->mapping)
- goto page_not_up_to_date_locked;
- if (!mapping->a_ops->is_partially_uptodate(page,
- pos & ~PAGE_MASK, count))
- goto page_not_up_to_date_locked;
- unlock_page(page);
- return page;
+ goto truncated;
-page_not_up_to_date:
- /* Get exclusive access to the page ... */
- error = lock_page_for_iocb(iocb, page);
- if (unlikely(error)) {
- put_page(page);
- return ERR_PTR(error);
- }
-
-page_not_up_to_date_locked:
- /* Did it get truncated before we got the lock? */
- if (!page->mapping) {
- unlock_page(page);
- put_page(page);
- return NULL;
- }
+ error = 0;
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
+ goto unlock;
- /* Did somebody else fill it already? */
- if (PageUptodate(page)) {
- unlock_page(page);
- return page;
- }
+ error = -EAGAIN;
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
+ goto unlock;
- return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+ error = filemap_read_page(iocb->ki_filp, mapping, page);
+ if (error == AOP_TRUNCATED_PAGE)
+ put_page(page);
+ return error;
+truncated:
+ unlock_page(page);
+ put_page(page);
+ return AOP_TRUNCATED_PAGE;
+unlock:
+ unlock_page(page);
+ return error;
}
-static struct page *
-generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
- struct iov_iter *iter)
+static int filemap_create_page(struct file *file,
+ struct address_space *mapping, pgoff_t index,
+ struct pagevec *pvec)
{
- struct file *filp = iocb->ki_filp;
- struct address_space *mapping = filp->f_mapping;
- pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
struct page *page;
int error;
- if (iocb->ki_flags & IOCB_NOIO)
- return ERR_PTR(-EAGAIN);
-
- /*
- * Ok, it wasn't cached, so we need to create a new
- * page..
- */
page = page_cache_alloc(mapping);
if (!page)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
- if (error) {
- put_page(page);
- return error != -EEXIST ? ERR_PTR(error) : NULL;
- }
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
+ if (error == -EEXIST)
+ error = AOP_TRUNCATED_PAGE;
+ if (error)
+ goto error;
- return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+ error = filemap_read_page(file, mapping, page);
+ if (error)
+ goto error;
+
+ pagevec_add(pvec, page);
+ return 0;
+error:
+ put_page(page);
+ return error;
}
-static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
- struct iov_iter *iter,
- struct page **pages,
- unsigned int nr)
+static int filemap_readahead(struct kiocb *iocb, struct file *file,
+ struct address_space *mapping, struct page *page,
+ pgoff_t last_index)
+{
+ if (iocb->ki_flags & IOCB_NOIO)
+ return -EAGAIN;
+ page_cache_async_readahead(mapping, &file->f_ra, file, page,
+ page->index, last_index - page->index);
+ return 0;
+}
+
+static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
+ struct pagevec *pvec)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
- pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
- int i, j, nr_got, err = 0;
+ pgoff_t last_index;
+ struct page *page;
+ int err = 0;
- nr = min_t(unsigned long, last_index - index, nr);
-find_page:
+ last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+retry:
if (fatal_signal_pending(current))
return -EINTR;
- nr_got = find_get_pages_contig(mapping, index, nr, pages);
- if (nr_got)
- goto got_pages;
-
- if (iocb->ki_flags & IOCB_NOIO)
- return -EAGAIN;
-
- page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
-
- nr_got = find_get_pages_contig(mapping, index, nr, pages);
- if (nr_got)
- goto got_pages;
-
- pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
- err = PTR_ERR_OR_ZERO(pages[0]);
- if (!IS_ERR_OR_NULL(pages[0]))
- nr_got = 1;
-got_pages:
- for (i = 0; i < nr_got; i++) {
- struct page *page = pages[i];
- pgoff_t pg_index = index + i;
- loff_t pg_pos = max(iocb->ki_pos,
- (loff_t) pg_index << PAGE_SHIFT);
- loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
-
- if (PageReadahead(page)) {
- if (iocb->ki_flags & IOCB_NOIO) {
- for (j = i; j < nr_got; j++)
- put_page(pages[j]);
- nr_got = i;
- err = -EAGAIN;
- break;
- }
- page_cache_async_readahead(mapping, ra, filp, page,
- pg_index, last_index - pg_index);
- }
-
- if (!PageUptodate(page)) {
- if ((iocb->ki_flags & IOCB_NOWAIT) ||
- ((iocb->ki_flags & IOCB_WAITQ) && i)) {
- for (j = i; j < nr_got; j++)
- put_page(pages[j]);
- nr_got = i;
- err = -EAGAIN;
- break;
- }
+ filemap_get_read_batch(mapping, index, last_index, pvec);
+ if (!pagevec_count(pvec)) {
+ if (iocb->ki_flags & IOCB_NOIO)
+ return -EAGAIN;
+ page_cache_sync_readahead(mapping, ra, filp, index,
+ last_index - index);
+ filemap_get_read_batch(mapping, index, last_index, pvec);
+ }
+ if (!pagevec_count(pvec)) {
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+ return -EAGAIN;
+ err = filemap_create_page(filp, mapping,
+ iocb->ki_pos >> PAGE_SHIFT, pvec);
+ if (err == AOP_TRUNCATED_PAGE)
+ goto retry;
+ return err;
+ }
- page = generic_file_buffered_read_pagenotuptodate(iocb,
- filp, iter, page, pg_pos, pg_count);
- if (IS_ERR_OR_NULL(page)) {
- for (j = i + 1; j < nr_got; j++)
- put_page(pages[j]);
- nr_got = i;
- err = PTR_ERR_OR_ZERO(page);
- break;
- }
- }
+ page = pvec->pages[pagevec_count(pvec) - 1];
+ if (PageReadahead(page)) {
+ err = filemap_readahead(iocb, filp, mapping, page, last_index);
+ if (err)
+ goto err;
+ }
+ if (!PageUptodate(page)) {
+ if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
+ iocb->ki_flags |= IOCB_NOWAIT;
+ err = filemap_update_page(iocb, mapping, iter, page);
+ if (err)
+ goto err;
}
- if (likely(nr_got))
- return nr_got;
- if (err)
- return err;
- /*
- * No pages and no error means we raced and should retry:
- */
- goto find_page;
+ return 0;
+err:
+ if (err < 0)
+ put_page(page);
+ if (likely(--pvec->nr))
+ return 0;
+ if (err == AOP_TRUNCATED_PAGE)
+ goto retry;
+ return err;
}
/**
- * generic_file_buffered_read - generic file read routine
- * @iocb: the iocb to read
- * @iter: data destination
- * @written: already copied
- *
- * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level stuff.
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
*
- * This is really ugly. But the goto's actually try to clarify some
- * of the logic when it comes to error handling etc.
+ * Copies data from the page cache. If the data is not currently present,
+ * uses the readahead and readpage address_space operations to fetch it.
*
- * Return:
- * * total number of bytes copied, including those the were already @written
- * * negative error code if nothing was copied
+ * Return: Total number of bytes copied, including those already read by
+ * the caller. If an error happens before any bytes are copied, returns
+ * a negative error number.
*/
-ssize_t generic_file_buffered_read(struct kiocb *iocb,
- struct iov_iter *iter, ssize_t written)
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+ ssize_t already_read)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
- unsigned int nr_pages = min_t(unsigned int, 512,
- ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
- (iocb->ki_pos >> PAGE_SHIFT));
- int i, pg_nr, error = 0;
+ struct pagevec pvec;
+ int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
@@ -2463,14 +2424,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
-
- if (nr_pages > ARRAY_SIZE(pages_onstack))
- pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
-
- if (!pages) {
- pages = pages_onstack;
- nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
- }
+ pagevec_init(&pvec);
do {
cond_resched();
@@ -2480,16 +2434,12 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
- if ((iocb->ki_flags & IOCB_WAITQ) && written)
+ if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
iocb->ki_flags |= IOCB_NOWAIT;
- i = 0;
- pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
- pages, nr_pages);
- if (pg_nr < 0) {
- error = pg_nr;
+ error = filemap_get_pages(iocb, iter, &pvec);
+ if (error < 0)
break;
- }
/*
* i_size must be checked after we know the pages are Uptodate.
@@ -2502,13 +2452,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
goto put_pages;
-
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
- while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
- (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
- put_page(pages[--pg_nr]);
-
/*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
@@ -2521,27 +2466,35 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
*/
if (iocb->ki_pos >> PAGE_SHIFT !=
ra->prev_pos >> PAGE_SHIFT)
- mark_page_accessed(pages[0]);
- for (i = 1; i < pg_nr; i++)
- mark_page_accessed(pages[i]);
+ mark_page_accessed(pvec.pages[0]);
- for (i = 0; i < pg_nr; i++) {
- unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
- unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
- PAGE_SIZE - offset);
- unsigned int copied;
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ size_t page_size = thp_size(page);
+ size_t offset = iocb->ki_pos & (page_size - 1);
+ size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+ page_size - offset);
+ size_t copied;
+ if (end_offset < page_offset(page))
+ break;
+ if (i > 0)
+ mark_page_accessed(page);
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (writably_mapped)
- flush_dcache_page(pages[i]);
+ if (writably_mapped) {
+ int j;
+
+ for (j = 0; j < thp_nr_pages(page); j++)
+ flush_dcache_page(page + j);
+ }
- copied = copy_page_to_iter(pages[i], offset, bytes, iter);
+ copied = copy_page_to_iter(page, offset, bytes, iter);
- written += copied;
+ already_read += copied;
iocb->ki_pos += copied;
ra->prev_pos = iocb->ki_pos;
@@ -2551,18 +2504,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
}
}
put_pages:
- for (i = 0; i < pg_nr; i++)
- put_page(pages[i]);
+ for (i = 0; i < pagevec_count(&pvec); i++)
+ put_page(pvec.pages[i]);
+ pagevec_reinit(&pvec);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
file_accessed(filp);
- if (pages != pages_onstack)
- kfree(pages);
-
- return written ? written : error;
+ return already_read ? already_read : error;
}
-EXPORT_SYMBOL_GPL(generic_file_buffered_read);
+EXPORT_SYMBOL_GPL(filemap_read);
/**
* generic_file_read_iter - generic filesystem read routine
@@ -2592,7 +2543,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t retval = 0;
if (!count)
- goto out; /* skip atime */
+ return 0; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct file *file = iocb->ki_filp;
@@ -2610,7 +2561,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (retval < 0)
- goto out;
+ return retval;
}
file_accessed(file);
@@ -2620,7 +2571,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += retval;
count -= retval;
}
- iov_iter_revert(iter, count - iov_iter_count(iter));
+ if (retval != -EIOCBQUEUED)
+ iov_iter_revert(iter, count - iov_iter_count(iter));
/*
* Btrfs can have a short DIO read if we encounter
@@ -2633,12 +2585,10 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
*/
if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
- goto out;
+ return retval;
}
- retval = generic_file_buffered_read(iocb, iter, retval);
-out:
- return retval;
+ return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);
@@ -3431,7 +3381,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
}
iocb->ki_pos = pos;
}
- iov_iter_revert(from, write_len - iov_iter_count(from));
+ if (written != -EIOCBQUEUED)
+ iov_iter_revert(from, write_len - iov_iter_count(from));
out:
return written;
}
diff --git a/mm/gup.c b/mm/gup.c
index e4c224cd9661..e40579624f10 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -78,9 +78,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
* considered failure, and furthermore, a likely bug in the caller, so a warning
* is also emitted.
*/
-static __maybe_unused struct page *try_grab_compound_head(struct page *page,
- int refs,
- unsigned int flags)
+__maybe_unused struct page *try_grab_compound_head(struct page *page,
+ int refs, unsigned int flags)
{
if (flags & FOLL_GET)
return try_get_compound_head(page, refs);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 91ca9b103ee5..d77605c30f2e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -386,7 +386,11 @@ static int __init hugepage_init(void)
struct kobject *hugepage_kobj;
if (!has_transparent_hugepage()) {
- transparent_hugepage_flags = 0;
+ /*
+ * Hardware doesn't support hugepages, hence disable
+ * DAX PMD support.
+ */
+ transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
return -EINVAL;
}
@@ -636,6 +640,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
@@ -690,20 +695,19 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
}
/* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
pmd_t entry;
if (!pmd_none(*pmd))
- return false;
+ return;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
if (pgtable)
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
- return true;
}
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
@@ -749,6 +753,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
haddr, vmf->pmd, zero_page);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
}
} else {
@@ -1439,7 +1444,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page);
+ put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
goto out;
}
@@ -1475,7 +1480,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page);
+ put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
goto out;
}
@@ -2176,7 +2181,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
lock_page_memcg(page);
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */
- __dec_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS,
+ -HPAGE_PMD_NR);
if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2751,10 +2757,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
+ int nr = thp_nr_pages(head);
+
if (PageSwapBacked(head))
- __dec_lruvec_page_state(head, NR_SHMEM_THPS);
+ __mod_lruvec_page_state(head, NR_SHMEM_THPS,
+ -nr);
else
- __dec_lruvec_page_state(head, NR_FILE_THPS);
+ __mod_lruvec_page_state(head, NR_FILE_THPS,
+ -nr);
}
__split_huge_page(page, list, end);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 905a7d549b00..8fb42c6dd74b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -79,34 +79,29 @@ DEFINE_SPINLOCK(hugetlb_lock);
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
-static inline bool PageHugeFreed(struct page *head)
-{
- return page_private(head + 4) == -1UL;
-}
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
-static inline void SetPageHugeFreed(struct page *head)
+static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
- set_page_private(head + 4, -1UL);
-}
+ if (spool->count)
+ return false;
+ if (spool->max_hpages != -1)
+ return spool->used_hpages == 0;
+ if (spool->min_hpages != -1)
+ return spool->rsv_hpages == spool->min_hpages;
-static inline void ClearPageHugeFreed(struct page *head)
-{
- set_page_private(head + 4, 0);
+ return true;
}
-/* Forward declaration */
-static int hugetlb_acct_memory(struct hstate *h, long delta);
-
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
- bool free = (spool->count == 0) && (spool->used_hpages == 0);
-
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
* remain, give up any reservations based on minimum size and
* free the subpool */
- if (free) {
+ if (subpool_is_free(spool)) {
if (spool->min_hpages != -1)
hugetlb_acct_memory(spool->hstate,
-spool->min_hpages);
@@ -1043,7 +1038,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
- SetPageHugeFreed(page);
+ SetHPageFreed(page);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1060,7 +1055,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
- ClearPageHugeFreed(page);
+ ClearHPageFreed(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
@@ -1133,7 +1128,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
- SetPagePrivate(page);
+ SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
@@ -1224,8 +1219,7 @@ static void destroy_compound_gigantic_page(struct page *page,
struct page *p = page + 1;
atomic_set(compound_mapcount_ptr(page), 0);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
clear_compound_head(p);
@@ -1312,14 +1306,16 @@ static inline void destroy_compound_gigantic_page(struct page *page,
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
+ struct page *subpage = page;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
- for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ for (i = 0; i < pages_per_huge_page(h);
+ i++, subpage = mem_map_next(subpage, page, i)) {
+ subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
@@ -1353,52 +1349,6 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
-/*
- * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
- * to hstate->hugepage_activelist.)
- *
- * This function can be called for tail pages, but never returns true for them.
- */
-bool page_huge_active(struct page *page)
-{
- return PageHeadHuge(page) && PagePrivate(&page[1]);
-}
-
-/* never called for tail page */
-void set_page_huge_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
- SetPagePrivate(&page[1]);
-}
-
-static void clear_page_huge_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
- ClearPagePrivate(&page[1]);
-}
-
-/*
- * Internal hugetlb specific page flag. Do not use outside of the hugetlb
- * code
- */
-static inline bool PageHugeTemporary(struct page *page)
-{
- if (!PageHuge(page))
- return false;
-
- return (unsigned long)page[2].mapping == -1U;
-}
-
-static inline void SetPageHugeTemporary(struct page *page)
-{
- page[2].mapping = (void *)-1U;
-}
-
-static inline void ClearPageHugeTemporary(struct page *page)
-{
- page[2].mapping = NULL;
-}
-
static void __free_huge_page(struct page *page)
{
/*
@@ -1407,24 +1357,23 @@ static void __free_huge_page(struct page *page)
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
- struct hugepage_subpool *spool =
- (struct hugepage_subpool *)page_private(page);
+ struct hugepage_subpool *spool = hugetlb_page_subpool(page);
bool restore_reserve;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
- set_page_private(page, 0);
+ hugetlb_set_page_subpool(page, NULL);
page->mapping = NULL;
- restore_reserve = PagePrivate(page);
- ClearPagePrivate(page);
+ restore_reserve = HPageRestoreReserve(page);
+ ClearHPageRestoreReserve(page);
/*
- * If PagePrivate() was set on page, page allocation consumed a
+ * If HPageRestoreReserve was set on page, page allocation consumed a
* reservation. If the page was associated with a subpool, there
* would have been a page reserved in the subpool before allocation
* via hugepage_subpool_get_pages(). Since we are 'restoring' the
- * reservtion, do not call hugepage_subpool_put_pages() as this will
+ * reservation, do not call hugepage_subpool_put_pages() as this will
* remove the reserved page from the subpool.
*/
if (!restore_reserve) {
@@ -1439,7 +1388,7 @@ static void __free_huge_page(struct page *page)
}
spin_lock(&hugetlb_lock);
- clear_page_huge_active(page);
+ ClearHPageMigratable(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
@@ -1447,9 +1396,9 @@ static void __free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
- if (PageHugeTemporary(page)) {
+ if (HPageTemporary(page)) {
list_del(&page->lru);
- ClearPageHugeTemporary(page);
+ ClearHPageTemporary(page);
update_and_free_page(h, page);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
@@ -1516,12 +1465,13 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ hugetlb_set_page_subpool(page, NULL);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
- ClearPageHugeFreed(page);
+ ClearHPageFreed(page);
spin_unlock(&hugetlb_lock);
}
@@ -1553,9 +1503,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
-
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
}
/*
@@ -1794,7 +1742,7 @@ retry:
* We should make sure that the page is already on the free list
* when it is dissolved.
*/
- if (unlikely(!PageHugeFreed(head))) {
+ if (unlikely(!HPageFreed(head))) {
spin_unlock(&hugetlb_lock);
cond_resched();
@@ -1885,7 +1833,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
* codeflow
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- SetPageHugeTemporary(page);
+ SetHPageTemporary(page);
spin_unlock(&hugetlb_lock);
put_page(page);
return NULL;
@@ -1916,7 +1864,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
- SetPageHugeTemporary(page);
+ SetHPageTemporary(page);
return page;
}
@@ -2254,24 +2202,24 @@ static long vma_add_reservation(struct hstate *h,
* This routine is called to restore a reservation on error paths. In the
* specific error paths, a huge page was allocated (via alloc_huge_page)
* and is about to be freed. If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set PagePrivate
- * in the newly allocated page. When the page is freed via free_huge_page,
- * the global reservation count will be incremented if PagePrivate is set.
- * However, free_huge_page can not adjust the reserve map. Adjust the
- * reserve map here to be consistent with global reserve count adjustments
- * to be made by free_huge_page.
+ * alloc_huge_page would have consumed the reservation and set
+ * HPageRestoreReserve in the newly allocated page. When the page is freed
+ * via free_huge_page, the global reservation count will be incremented if
+ * HPageRestoreReserve is set. However, free_huge_page can not adjust the
+ * reserve map. Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page.
*/
static void restore_reserve_on_error(struct hstate *h,
struct vm_area_struct *vma, unsigned long address,
struct page *page)
{
- if (unlikely(PagePrivate(page))) {
+ if (unlikely(HPageRestoreReserve(page))) {
long rc = vma_needs_reservation(h, vma, address);
if (unlikely(rc < 0)) {
/*
* Rare out of memory condition in reserve map
- * manipulation. Clear PagePrivate so that
+ * manipulation. Clear HPageRestoreReserve so that
* global reserve count will not be incremented
* by free_huge_page. This will make it appear
* as though the reservation for this page was
@@ -2280,7 +2228,7 @@ static void restore_reserve_on_error(struct hstate *h,
* is better than inconsistent global huge page
* accounting of reserve counts.
*/
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
} else if (rc) {
rc = vma_add_reservation(h, vma, address);
if (unlikely(rc < 0))
@@ -2288,7 +2236,7 @@ static void restore_reserve_on_error(struct hstate *h,
* See above comment about rare out of
* memory condition.
*/
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
} else
vma_end_reservation(h, vma, address);
}
@@ -2369,7 +2317,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
if (!page)
goto out_uncharge_cgroup;
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- SetPagePrivate(page);
+ SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
spin_lock(&hugetlb_lock);
@@ -2387,7 +2335,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock);
- set_page_private(page, (unsigned long)spool);
+ hugetlb_set_page_subpool(page, spool);
map_commit = vma_commit_reservation(h, vma, addr);
if (unlikely(map_chg > map_commit)) {
@@ -2476,7 +2424,7 @@ static void __init gather_bootmem_prealloc(void)
struct hstate *h = m->hstate;
WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, h->order);
+ prep_compound_huge_page(page, huge_page_order(h));
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* free it into the hugepage allocator */
@@ -2488,7 +2436,7 @@ static void __init gather_bootmem_prealloc(void)
* side-effects, like CommitLimit going negative.
*/
if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, 1 << h->order);
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
@@ -2520,7 +2468,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (hstate_is_gigantic(h)) {
if (hugetlb_cma_size) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
- break;
+ goto free;
}
if (!alloc_bootmem_huge_page(h))
break;
@@ -2538,7 +2486,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
-
+free:
kfree(node_alloc_noretry);
}
@@ -2988,8 +2936,10 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
return -ENOMEM;
retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
- if (retval)
+ if (retval) {
kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ }
return retval;
}
@@ -3159,6 +3109,9 @@ static int __init hugetlb_init(void)
{
int i;
+ BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
+ __NR_HPAGEFLAGS);
+
if (!hugepages_supported()) {
if (hugetlb_max_hstate || default_hstate_max_huge_pages)
pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
@@ -3239,7 +3192,7 @@ void __init hugetlb_add_hstate(unsigned int order)
BUG_ON(order == 0);
h = &hstates[hugetlb_max_hstate++];
h->order = order;
- h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+ h->mask = ~(huge_page_size(h) - 1);
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
@@ -3408,8 +3361,7 @@ static unsigned int allowed_mems_nr(struct hstate *h)
mpol_allowed = policy_nodemask_current(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mpol_allowed ||
- (mpol_allowed && node_isset(node, *mpol_allowed)))
+ if (!mpol_allowed || node_isset(node, *mpol_allowed))
nr += array[node];
}
@@ -3515,7 +3467,7 @@ void hugetlb_report_meminfo(struct seq_file *m)
for_each_hstate(h) {
unsigned long count = h->nr_huge_pages;
- total += (PAGE_SIZE << huge_page_order(h)) * count;
+ total += huge_page_size(h) * count;
if (h == &default_hstate)
seq_printf(m,
@@ -3528,10 +3480,10 @@ void hugetlb_report_meminfo(struct seq_file *m)
h->free_huge_pages,
h->resv_huge_pages,
h->surplus_huge_pages,
- (PAGE_SIZE << huge_page_order(h)) / 1024);
+ huge_page_size(h) / SZ_1K);
}
- seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
+ seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
}
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
@@ -3565,7 +3517,7 @@ void hugetlb_show_meminfo(void)
h->nr_huge_pages_node[nid],
h->free_huge_pages_node[nid],
h->surplus_huge_pages_node[nid],
- 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+ huge_page_size(h) / SZ_1K);
}
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
@@ -3589,6 +3541,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
{
int ret = -ENOMEM;
+ if (!delta)
+ return 0;
+
spin_lock(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
@@ -3685,15 +3640,13 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
- struct hstate *hstate = hstate_vma(vma);
-
- return 1UL << huge_page_shift(hstate);
+ return huge_page_size(hstate_vma(vma));
}
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
+ * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
@@ -4017,7 +3970,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
/*
* This is called when the original mapper is failing to COW a MAP_PRIVATE
- * mappping it owns the reserve page for. The intention is to unmap the page
+ * mapping it owns the reserve page for. The intention is to unmap the page
* from other VMAs and let the children be SIGKILLed if they are faulting the
* same region.
*/
@@ -4196,7 +4149,7 @@ retry_avoidcopy:
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- ClearPagePrivate(new_page);
+ ClearHPageRestoreReserve(new_page);
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
@@ -4205,7 +4158,7 @@ retry_avoidcopy:
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
- set_page_huge_active(new_page);
+ SetHPageMigratable(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -4263,7 +4216,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
if (err)
return err;
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
/*
* set page dirty so that it will not be removed from cache/file
@@ -4425,7 +4378,7 @@ retry:
goto backout;
if (anon_rmap) {
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, vma, haddr);
} else
page_dup_rmap(page, true);
@@ -4442,12 +4395,12 @@ retry:
spin_unlock(ptl);
/*
- * Only make newly allocated pages active. Existing pages found
- * in the pagecache could be !page_huge_active() if they have been
- * isolated for migration.
+ * Only set HPageMigratable in newly allocated pages. Existing pages
+ * found in the pagecache may not have HPageMigratableset if they have
+ * been isolated for migration.
*/
if (new_page)
- set_page_huge_active(page);
+ SetHPageMigratable(page);
unlock_page(page);
out:
@@ -4477,7 +4430,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
}
#else
/*
- * For uniprocesor systems we always use a single mutex, so just
+ * For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
@@ -4739,7 +4692,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (vm_shared) {
page_dup_rmap(page, true);
} else {
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
@@ -4758,7 +4711,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
- set_page_huge_active(page);
+ SetHPageMigratable(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4773,6 +4726,20 @@ out_release_nounlock:
goto out;
}
+static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
+ int refs, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ int nr;
+
+ for (nr = 0; nr < refs; nr++) {
+ if (likely(pages))
+ pages[nr] = mem_map_offset(page, nr);
+ if (vmas)
+ vmas[nr] = vma;
+ }
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -4782,7 +4749,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
- int err = -EFAULT;
+ int err = -EFAULT, refs;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
@@ -4902,20 +4869,29 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
continue;
}
-same_page:
+ refs = min3(pages_per_huge_page(h) - pfn_offset,
+ (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
+
+ if (pages || vmas)
+ record_subpages_vmas(mem_map_offset(page, pfn_offset),
+ vma, refs,
+ likely(pages) ? pages + i : NULL,
+ vmas ? vmas + i : NULL);
+
if (pages) {
- pages[i] = mem_map_offset(page, pfn_offset);
/*
- * try_grab_page() should always succeed here, because:
- * a) we hold the ptl lock, and b) we've just checked
- * that the huge page is present in the page tables. If
- * the huge page is present, then the tail pages must
- * also be present. The ptl prevents the head page and
- * tail pages from being rearranged in any way. So this
- * page must be available at this point, unless the page
- * refcount overflowed:
+ * try_grab_compound_head() should always succeed here,
+ * because: a) we hold the ptl lock, and b) we've just
+ * checked that the huge page is present in the page
+ * tables. If the huge page is present, then the tail
+ * pages must also be present. The ptl prevents the
+ * head page and tail pages from being rearranged in
+ * any way. So this page must be available at this
+ * point, unless the page refcount overflowed:
*/
- if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+ if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
+ refs,
+ flags))) {
spin_unlock(ptl);
remainder = 0;
err = -ENOMEM;
@@ -4923,21 +4899,10 @@ same_page:
}
}
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- ++pfn_offset;
- --remainder;
- ++i;
- if (vaddr < vma->vm_end && remainder &&
- pfn_offset < pages_per_huge_page(h)) {
- /*
- * We use pfn_offset to avoid touching the pageframes
- * of this compound page.
- */
- goto same_page;
- }
+ vaddr += (refs << PAGE_SHIFT);
+ remainder -= refs;
+ i += refs;
+
spin_unlock(ptl);
}
*nr_pages = remainder;
@@ -5051,12 +5016,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
return pages << h->order;
}
-int hugetlb_reserve_pages(struct inode *inode,
+/* Return true if reservation was successful, false otherwise. */
+bool hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long ret, chg, add = -1;
+ long chg, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -5066,7 +5032,7 @@ int hugetlb_reserve_pages(struct inode *inode,
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
- return -EINVAL;
+ return false;
}
/*
@@ -5075,7 +5041,7 @@ int hugetlb_reserve_pages(struct inode *inode,
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
- return 0;
+ return true;
/*
* Shared mappings base their reservation on the number of pages that
@@ -5097,7 +5063,7 @@ int hugetlb_reserve_pages(struct inode *inode,
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return -ENOMEM;
+ return false;
chg = to - from;
@@ -5105,18 +5071,12 @@ int hugetlb_reserve_pages(struct inode *inode,
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
- if (chg < 0) {
- ret = chg;
+ if (chg < 0)
goto out_err;
- }
- ret = hugetlb_cgroup_charge_cgroup_rsvd(
- hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
-
- if (ret < 0) {
- ret = -ENOMEM;
+ if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
+ chg * pages_per_huge_page(h), &h_cg) < 0)
goto out_err;
- }
if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
@@ -5131,19 +5091,15 @@ int hugetlb_reserve_pages(struct inode *inode,
* reservations already in place (gbl_reserve).
*/
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
- if (gbl_reserve < 0) {
- ret = -ENOSPC;
+ if (gbl_reserve < 0)
goto out_uncharge_cgroup;
- }
/*
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, gbl_reserve);
- if (ret < 0) {
+ if (hugetlb_acct_memory(h, gbl_reserve) < 0)
goto out_put_pages;
- }
/*
* Account for the reservations made. Shared mappings record regions
@@ -5161,7 +5117,6 @@ int hugetlb_reserve_pages(struct inode *inode,
if (unlikely(add < 0)) {
hugetlb_acct_memory(h, -gbl_reserve);
- ret = add;
goto out_put_pages;
} else if (unlikely(chg > add)) {
/*
@@ -5182,7 +5137,8 @@ int hugetlb_reserve_pages(struct inode *inode,
hugetlb_acct_memory(h, -rsv_adjust);
}
}
- return 0;
+ return true;
+
out_put_pages:
/* put back original number of pages, chg */
(void)hugepage_subpool_put_pages(spool, chg);
@@ -5198,7 +5154,7 @@ out_err:
region_abort(resv_map, from, to, regions_needed);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
- return ret;
+ return false;
}
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
@@ -5259,7 +5215,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
- sbase < svma->vm_start || svma->vm_end < s_end)
+ !range_in_vma(svma, sbase, s_end))
return 0;
return saddr;
@@ -5286,21 +5242,23 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long a_start, a_end;
+ unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
+ v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
- if (!(vma->vm_flags & VM_MAYSHARE))
+ /*
+ * vma need span at least one aligned PUD size and the start,end range
+ * must at least partialy within it.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
+ (*end <= v_start) || (*start >= v_end))
return;
/* Extend the range to be PUD aligned for a worst case scenario */
- a_start = ALIGN_DOWN(*start, PUD_SIZE);
- a_end = ALIGN(*end, PUD_SIZE);
+ if (*start > v_start)
+ *start = ALIGN_DOWN(*start, PUD_SIZE);
- /*
- * Intersect the range with the vma range, since pmd sharing won't be
- * across vma after all
- */
- *start = max(vma->vm_start, a_start);
- *end = min(vma->vm_end, a_end);
+ if (*end < v_end)
+ *end = ALIGN(*end, PUD_SIZE);
}
/*
@@ -5583,12 +5541,13 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
bool ret = true;
spin_lock(&hugetlb_lock);
- if (!PageHeadHuge(page) || !page_huge_active(page) ||
+ if (!PageHeadHuge(page) ||
+ !HPageMigratable(page) ||
!get_page_unless_zero(page)) {
ret = false;
goto unlock;
}
- clear_page_huge_active(page);
+ ClearHPageMigratable(page);
list_move_tail(&page->lru, list);
unlock:
spin_unlock(&hugetlb_lock);
@@ -5597,9 +5556,8 @@ unlock:
void putback_active_hugepage(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
- set_page_huge_active(page);
+ SetHPageMigratable(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
put_page(page);
@@ -5622,12 +5580,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
* here as well otherwise the global surplus count will not match
* the per-node's.
*/
- if (PageHugeTemporary(newpage)) {
+ if (HPageTemporary(newpage)) {
int old_nid = page_to_nid(oldpage);
int new_nid = page_to_nid(newpage);
- SetPageHugeTemporary(oldpage);
- ClearPageHugeTemporary(newpage);
+ SetHPageTemporary(oldpage);
+ ClearHPageTemporary(newpage);
spin_lock(&hugetlb_lock);
if (h->surplus_huge_pages_node[old_nid]) {
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 9182848dda3e..f68b51fcda3d 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -113,7 +113,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
rsvd_parent);
limit = round_down(PAGE_COUNTER_MAX,
- 1 << huge_page_order(&hstates[idx]));
+ pages_per_huge_page(&hstates[idx]));
ret = page_counter_set_max(
hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
@@ -460,7 +460,7 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
counter = &h_cg->hugepage[idx];
limit = round_down(PAGE_COUNTER_MAX,
- 1 << huge_page_order(&hstates[idx]));
+ pages_per_huge_page(&hstates[idx]));
switch (MEMFILE_ATTR(cft->private)) {
case RES_RSVD_USAGE:
@@ -507,7 +507,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return ret;
idx = MEMFILE_IDX(of_cft(of)->private);
- nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
+ nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_RSVD_LIMIT:
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b25167664ead..b18189ef3a92 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -60,7 +60,7 @@ void kasan_disable_current(void)
void __kasan_unpoison_range(const void *address, size_t size)
{
- unpoison_range(address, size);
+ kasan_unpoison(address, size);
}
#if CONFIG_KASAN_STACK
@@ -69,7 +69,7 @@ void kasan_unpoison_task_stack(struct task_struct *task)
{
void *base = task_stack_page(task);
- unpoison_range(base, THREAD_SIZE);
+ kasan_unpoison(base, THREAD_SIZE);
}
/* Unpoison the stack for the current task beyond a watermark sp value. */
@@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
*/
void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
- unpoison_range(base, watermark - base);
+ kasan_unpoison(base, watermark - base);
}
#endif /* CONFIG_KASAN_STACK */
@@ -105,18 +105,17 @@ void __kasan_alloc_pages(struct page *page, unsigned int order)
if (unlikely(PageHighMem(page)))
return;
- tag = random_tag();
+ tag = kasan_random_tag();
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- unpoison_range(page_address(page), PAGE_SIZE << order);
+ kasan_unpoison(page_address(page), PAGE_SIZE << order);
}
void __kasan_free_pages(struct page *page, unsigned int order)
{
if (likely(!PageHighMem(page)))
- poison_range(page_address(page),
- PAGE_SIZE << order,
- KASAN_FREE_PAGE);
+ kasan_poison(page_address(page), PAGE_SIZE << order,
+ KASAN_FREE_PAGE);
}
/*
@@ -246,18 +245,18 @@ void __kasan_poison_slab(struct page *page)
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
- poison_range(page_address(page), page_size(page),
+ kasan_poison(page_address(page), page_size(page),
KASAN_KMALLOC_REDZONE);
}
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
- unpoison_range(object, cache->object_size);
+ kasan_unpoison(object, cache->object_size);
}
void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
- poison_range(object, cache->object_size, KASAN_KMALLOC_REDZONE);
+ kasan_poison(object, cache->object_size, KASAN_KMALLOC_REDZONE);
}
/*
@@ -294,7 +293,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object,
* set, assign a tag when the object is being allocated (init == false).
*/
if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
- return init ? KASAN_TAG_KERNEL : random_tag();
+ return init ? KASAN_TAG_KERNEL : kasan_random_tag();
/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
@@ -305,7 +304,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object,
* For SLUB assign a random tag during slab creation, otherwise reuse
* the already assigned tag.
*/
- return init ? random_tag() : get_tag(object);
+ return init ? kasan_random_tag() : get_tag(object);
#endif
}
@@ -346,12 +345,12 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
return false;
- if (check_invalid_free(tagged_object)) {
+ if (!kasan_byte_accessible(tagged_object)) {
kasan_report_invalid_free(tagged_object, ip);
return true;
}
- poison_range(object, cache->object_size, KASAN_KMALLOC_FREE);
+ kasan_poison(object, cache->object_size, KASAN_KMALLOC_FREE);
if (!kasan_stack_collection_enabled())
return false;
@@ -361,7 +360,7 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
kasan_set_free_info(cache, object, tag);
- return quarantine_put(cache, object);
+ return kasan_quarantine_put(cache, object);
}
bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
@@ -386,7 +385,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
kasan_report_invalid_free(ptr, ip);
return;
}
- poison_range(ptr, page_size(page), KASAN_FREE_PAGE);
+ kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE);
} else {
____kasan_slab_free(page->slab_cache, ptr, ip, false);
}
@@ -409,7 +408,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
u8 tag;
if (gfpflags_allow_blocking(flags))
- quarantine_reduce();
+ kasan_quarantine_reduce();
if (unlikely(object == NULL))
return NULL;
@@ -421,9 +420,9 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
tag = assign_tag(cache, object, false, keep_tag);
/* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */
- unpoison_range(set_tag(object, tag), size);
- poison_range((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
+ kasan_unpoison(set_tag(object, tag), size);
+ kasan_poison((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_KMALLOC_REDZONE);
if (kasan_stack_collection_enabled())
set_alloc_info(cache, (void *)object, flags);
@@ -452,7 +451,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
unsigned long redzone_end;
if (gfpflags_allow_blocking(flags))
- quarantine_reduce();
+ kasan_quarantine_reduce();
if (unlikely(ptr == NULL))
return NULL;
@@ -462,8 +461,8 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
KASAN_GRANULE_SIZE);
redzone_end = (unsigned long)ptr + page_size(page);
- unpoison_range(ptr, size);
- poison_range((void *)redzone_start, redzone_end - redzone_start,
+ kasan_unpoison(ptr, size);
+ kasan_poison((void *)redzone_start, redzone_end - redzone_start,
KASAN_PAGE_REDZONE);
return (void *)ptr;
@@ -491,3 +490,12 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
kasan_report_invalid_free(ptr, ip);
/* The object will be poisoned by kasan_free_pages(). */
}
+
+bool __kasan_check_byte(const void *address, unsigned long ip)
+{
+ if (!kasan_byte_accessible(address)) {
+ kasan_report((unsigned long)address, 1, false, ip);
+ return false;
+ }
+ return true;
+}
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 5106b84b07d4..3f17a1218055 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -158,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
-static __always_inline bool check_memory_region_inline(unsigned long addr,
+static __always_inline bool check_region_inline(unsigned long addr,
size_t size, bool write,
unsigned long ret_ip)
{
@@ -179,37 +179,37 @@ static __always_inline bool check_memory_region_inline(unsigned long addr,
return !kasan_report(addr, size, write, ret_ip);
}
-bool check_memory_region(unsigned long addr, size_t size, bool write,
- unsigned long ret_ip)
+bool kasan_check_range(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip)
{
- return check_memory_region_inline(addr, size, write, ret_ip);
+ return check_region_inline(addr, size, write, ret_ip);
}
-bool check_invalid_free(void *addr)
+bool kasan_byte_accessible(const void *addr)
{
s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
- return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE;
+ return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE;
}
void kasan_cache_shrink(struct kmem_cache *cache)
{
- quarantine_remove_cache(cache);
+ kasan_quarantine_remove_cache(cache);
}
void kasan_cache_shutdown(struct kmem_cache *cache)
{
if (!__kmem_cache_empty(cache))
- quarantine_remove_cache(cache);
+ kasan_quarantine_remove_cache(cache);
}
static void register_global(struct kasan_global *global)
{
size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE);
- unpoison_range(global->beg, global->size);
+ kasan_unpoison(global->beg, global->size);
- poison_range(global->beg + aligned_size,
+ kasan_poison(global->beg + aligned_size,
global->size_with_redzone - aligned_size,
KASAN_GLOBAL_REDZONE);
}
@@ -231,7 +231,7 @@ EXPORT_SYMBOL(__asan_unregister_globals);
#define DEFINE_ASAN_LOAD_STORE(size) \
void __asan_load##size(unsigned long addr) \
{ \
- check_memory_region_inline(addr, size, false, _RET_IP_);\
+ check_region_inline(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_load##size); \
__alias(__asan_load##size) \
@@ -239,7 +239,7 @@ EXPORT_SYMBOL(__asan_unregister_globals);
EXPORT_SYMBOL(__asan_load##size##_noabort); \
void __asan_store##size(unsigned long addr) \
{ \
- check_memory_region_inline(addr, size, true, _RET_IP_); \
+ check_region_inline(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_store##size); \
__alias(__asan_store##size) \
@@ -254,7 +254,7 @@ DEFINE_ASAN_LOAD_STORE(16);
void __asan_loadN(unsigned long addr, size_t size)
{
- check_memory_region(addr, size, false, _RET_IP_);
+ kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_loadN);
@@ -264,7 +264,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort);
void __asan_storeN(unsigned long addr, size_t size)
{
- check_memory_region(addr, size, true, _RET_IP_);
+ kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_storeN);
@@ -290,11 +290,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
- unpoison_range((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
- poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ kasan_unpoison((const void *)(addr + rounded_down_size),
+ size - rounded_down_size);
+ kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
KASAN_ALLOCA_LEFT);
- poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
KASAN_ALLOCA_RIGHT);
}
EXPORT_SYMBOL(__asan_alloca_poison);
@@ -305,7 +305,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
if (unlikely(!stack_top || stack_top > stack_bottom))
return;
- unpoison_range(stack_top, stack_bottom - stack_top);
+ kasan_unpoison(stack_top, stack_bottom - stack_top);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index d558799b25b3..b31aeef505dd 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -185,3 +185,19 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
return &alloc_meta->free_track[0];
}
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+void kasan_set_tagging_report_once(bool state)
+{
+ hw_set_tagging_report_once(state);
+}
+EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once);
+
+void kasan_enable_tagging(void)
+{
+ hw_enable_tagging();
+}
+EXPORT_SYMBOL_GPL(kasan_enable_tagging);
+
+#endif
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8c706e7652f2..cc14b6e6c14c 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -36,6 +36,12 @@ extern bool kasan_flag_panic __ro_after_init;
#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
+#ifdef CONFIG_KASAN_HW_TAGS
+#define KASAN_TAG_MIN 0xF0 /* mimimum value for random tags */
+#else
+#define KASAN_TAG_MIN 0x00 /* mimimum value for random tags */
+#endif
+
#ifdef CONFIG_KASAN_GENERIC
#define KASAN_FREE_PAGE 0xFF /* page was freed */
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
@@ -195,14 +201,14 @@ static inline bool addr_has_metadata(const void *addr)
}
/**
- * check_memory_region - Check memory region, and report if invalid access.
+ * kasan_check_range - Check memory region, and report if invalid access.
* @addr: the accessed address
* @size: the accessed size
* @write: true if access is a write access
* @ret_ip: return address
* @return: true if access was valid, false if invalid
*/
-bool check_memory_region(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(unsigned long addr, size_t size, bool write,
unsigned long ret_ip);
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
@@ -215,19 +221,19 @@ static inline bool addr_has_metadata(const void *addr)
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
-void print_tags(u8 addr_tag, const void *addr);
+void kasan_print_tags(u8 addr_tag, const void *addr);
#else
-static inline void print_tags(u8 addr_tag, const void *addr) { }
+static inline void kasan_print_tags(u8 addr_tag, const void *addr) { }
#endif
-void *find_first_bad_addr(void *addr, size_t size);
-const char *get_bug_type(struct kasan_access_info *info);
-void metadata_fetch_row(char *buffer, void *row);
+void *kasan_find_first_bad_addr(void *addr, size_t size);
+const char *kasan_get_bug_type(struct kasan_access_info *info);
+void kasan_metadata_fetch_row(char *buffer, void *row);
#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK
-void print_address_stack_frame(const void *addr);
+void kasan_print_address_stack_frame(const void *addr);
#else
-static inline void print_address_stack_frame(const void *addr) { }
+static inline void kasan_print_address_stack_frame(const void *addr) { }
#endif
bool kasan_report(unsigned long addr, size_t size,
@@ -244,13 +250,13 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
-bool quarantine_put(struct kmem_cache *cache, void *object);
-void quarantine_reduce(void);
-void quarantine_remove_cache(struct kmem_cache *cache);
+bool kasan_quarantine_put(struct kmem_cache *cache, void *object);
+void kasan_quarantine_reduce(void);
+void kasan_quarantine_remove_cache(struct kmem_cache *cache);
#else
-static inline bool quarantine_put(struct kmem_cache *cache, void *object) { return false; }
-static inline void quarantine_reduce(void) { }
-static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+static inline bool kasan_quarantine_put(struct kmem_cache *cache, void *object) { return false; }
+static inline void kasan_quarantine_reduce(void) { }
+static inline void kasan_quarantine_remove_cache(struct kmem_cache *cache) { }
#endif
#ifndef arch_kasan_set_tag
@@ -274,6 +280,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifndef arch_init_tags
#define arch_init_tags(max_tag)
#endif
+#ifndef arch_set_tagging_report_once
+#define arch_set_tagging_report_once(state)
+#endif
#ifndef arch_get_random_tag
#define arch_get_random_tag() (0xFF)
#endif
@@ -286,48 +295,66 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_enable_tagging() arch_enable_tagging()
#define hw_init_tags(max_tag) arch_init_tags(max_tag)
+#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state)
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag))
+#else /* CONFIG_KASAN_HW_TAGS */
+
+#define hw_enable_tagging()
+#define hw_set_tagging_report_once(state)
+
#endif /* CONFIG_KASAN_HW_TAGS */
+#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+void kasan_set_tagging_report_once(bool state);
+void kasan_enable_tagging(void);
+
+#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
+
+static inline void kasan_set_tagging_report_once(bool state) { }
+static inline void kasan_enable_tagging(void) { }
+
+#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
+
#ifdef CONFIG_KASAN_SW_TAGS
-u8 random_tag(void);
+u8 kasan_random_tag(void);
#elif defined(CONFIG_KASAN_HW_TAGS)
-static inline u8 random_tag(void) { return hw_get_random_tag(); }
+static inline u8 kasan_random_tag(void) { return hw_get_random_tag(); }
#else
-static inline u8 random_tag(void) { return 0; }
+static inline u8 kasan_random_tag(void) { return 0; }
#endif
#ifdef CONFIG_KASAN_HW_TAGS
-static inline void poison_range(const void *address, size_t size, u8 value)
+static inline void kasan_poison(const void *address, size_t size, u8 value)
{
hw_set_mem_tag_range(kasan_reset_tag(address),
round_up(size, KASAN_GRANULE_SIZE), value);
}
-static inline void unpoison_range(const void *address, size_t size)
+static inline void kasan_unpoison(const void *address, size_t size)
{
hw_set_mem_tag_range(kasan_reset_tag(address),
round_up(size, KASAN_GRANULE_SIZE), get_tag(address));
}
-static inline bool check_invalid_free(void *addr)
+static inline bool kasan_byte_accessible(const void *addr)
{
u8 ptr_tag = get_tag(addr);
- u8 mem_tag = hw_get_mem_tag(addr);
+ u8 mem_tag = hw_get_mem_tag((void *)addr);
- return (mem_tag == KASAN_TAG_INVALID) ||
- (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag);
+ return (mem_tag != KASAN_TAG_INVALID) &&
+ (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag);
}
#else /* CONFIG_KASAN_HW_TAGS */
-void poison_range(const void *address, size_t size, u8 value);
-void unpoison_range(const void *address, size_t size);
-bool check_invalid_free(void *addr);
+void kasan_poison(const void *address, size_t size, u8 value);
+void kasan_unpoison(const void *address, size_t size);
+bool kasan_byte_accessible(const void *addr);
#endif /* CONFIG_KASAN_HW_TAGS */
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 55783125a767..728fb24c5683 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -168,7 +168,7 @@ static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
qlist_init(q);
}
-bool quarantine_put(struct kmem_cache *cache, void *object)
+bool kasan_quarantine_put(struct kmem_cache *cache, void *object)
{
unsigned long flags;
struct qlist_head *q;
@@ -184,11 +184,11 @@ bool quarantine_put(struct kmem_cache *cache, void *object)
/*
* Note: irq must be disabled until after we move the batch to the
- * global quarantine. Otherwise quarantine_remove_cache() can miss
- * some objects belonging to the cache if they are in our local temp
- * list. quarantine_remove_cache() executes on_each_cpu() at the
- * beginning which ensures that it either sees the objects in per-cpu
- * lists or in the global quarantine.
+ * global quarantine. Otherwise kasan_quarantine_remove_cache() can
+ * miss some objects belonging to the cache if they are in our local
+ * temp list. kasan_quarantine_remove_cache() executes on_each_cpu()
+ * at the beginning which ensures that it either sees the objects in
+ * per-cpu lists or in the global quarantine.
*/
local_irq_save(flags);
@@ -222,7 +222,7 @@ bool quarantine_put(struct kmem_cache *cache, void *object)
return true;
}
-void quarantine_reduce(void)
+void kasan_quarantine_reduce(void)
{
size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
@@ -234,7 +234,7 @@ void quarantine_reduce(void)
return;
/*
- * srcu critical section ensures that quarantine_remove_cache()
+ * srcu critical section ensures that kasan_quarantine_remove_cache()
* will not miss objects belonging to the cache while they are in our
* local to_free list. srcu is chosen because (1) it gives us private
* grace period domain that does not interfere with anything else,
@@ -309,15 +309,15 @@ static void per_cpu_remove_cache(void *arg)
}
/* Free all quarantined objects belonging to cache. */
-void quarantine_remove_cache(struct kmem_cache *cache)
+void kasan_quarantine_remove_cache(struct kmem_cache *cache)
{
unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
/*
* Must be careful to not miss any objects that are being moved from
- * per-cpu list to the global quarantine in quarantine_put(),
- * nor objects being freed in quarantine_reduce(). on_each_cpu()
+ * per-cpu list to the global quarantine in kasan_quarantine_put(),
+ * nor objects being freed in kasan_quarantine_reduce(). on_each_cpu()
* achieves the first goal, while synchronize_srcu() achieves the
* second.
*/
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index c0fb21797550..234f35a84f19 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -61,7 +61,7 @@ __setup("kasan_multi_shot", kasan_set_multi_shot);
static void print_error_description(struct kasan_access_info *info)
{
pr_err("BUG: KASAN: %s in %pS\n",
- get_bug_type(info), (void *)info->ip);
+ kasan_get_bug_type(info), (void *)info->ip);
if (info->access_size)
pr_err("%s of size %zu at addr %px by task %s/%d\n",
info->is_write ? "Write" : "Read", info->access_size,
@@ -247,7 +247,7 @@ static void print_address_description(void *addr, u8 tag)
dump_page(page, "kasan: bad access detected");
}
- print_address_stack_frame(addr);
+ kasan_print_address_stack_frame(addr);
}
static bool meta_row_is_guilty(const void *row, const void *addr)
@@ -293,7 +293,7 @@ static void print_memory_metadata(const void *addr)
* function, because generic functions may try to
* access kasan mapping for the passed address.
*/
- metadata_fetch_row(&metadata[0], row);
+ kasan_metadata_fetch_row(&metadata[0], row);
print_hex_dump(KERN_ERR, buffer,
DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1,
@@ -331,7 +331,7 @@ static void kasan_update_kunit_status(struct kunit *cur_test)
}
kasan_data = (struct kunit_kasan_expectation *)resource->data;
- kasan_data->report_found = true;
+ WRITE_ONCE(kasan_data->report_found, true);
kunit_put_resource(resource);
}
#endif /* IS_ENABLED(CONFIG_KUNIT) */
@@ -350,7 +350,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- print_tags(tag, object);
+ kasan_print_tags(tag, object);
pr_err("\n");
print_address_description(object, tag);
pr_err("\n");
@@ -378,7 +378,8 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
info.access_addr = tagged_addr;
if (addr_has_metadata(untagged_addr))
- info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
+ info.first_bad_addr =
+ kasan_find_first_bad_addr(tagged_addr, size);
else
info.first_bad_addr = untagged_addr;
info.access_size = size;
@@ -389,7 +390,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
print_error_description(&info);
if (addr_has_metadata(untagged_addr))
- print_tags(get_tag(tagged_addr), info.first_bad_addr);
+ kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr);
pr_err("\n");
if (addr_has_metadata(untagged_addr)) {
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 8a9c889872da..41f374585144 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -30,7 +30,7 @@
#include "kasan.h"
#include "../slab.h"
-void *find_first_bad_addr(void *addr, size_t size)
+void *kasan_find_first_bad_addr(void *addr, size_t size)
{
void *p = addr;
@@ -105,7 +105,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info)
return bug_type;
}
-const char *get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_access_info *info)
{
/*
* If access_size is a negative number, then it has reason to be
@@ -123,7 +123,7 @@ const char *get_bug_type(struct kasan_access_info *info)
return get_wild_bug_type(info);
}
-void metadata_fetch_row(char *buffer, void *row)
+void kasan_metadata_fetch_row(char *buffer, void *row)
{
memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
}
@@ -263,7 +263,7 @@ static bool __must_check get_address_stack_frame_info(const void *addr,
return true;
}
-void print_address_stack_frame(const void *addr)
+void kasan_print_address_stack_frame(const void *addr)
{
unsigned long offset;
const char *frame_descr;
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index 57114f0e14d1..42b2168755d6 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -15,17 +15,17 @@
#include "kasan.h"
-const char *get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_access_info *info)
{
return "invalid-access";
}
-void *find_first_bad_addr(void *addr, size_t size)
+void *kasan_find_first_bad_addr(void *addr, size_t size)
{
return kasan_reset_tag(addr);
}
-void metadata_fetch_row(char *buffer, void *row)
+void kasan_metadata_fetch_row(char *buffer, void *row)
{
int i;
@@ -33,7 +33,7 @@ void metadata_fetch_row(char *buffer, void *row)
buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE);
}
-void print_tags(u8 addr_tag, const void *addr)
+void kasan_print_tags(u8 addr_tag, const void *addr)
{
u8 memory_tag = hw_get_mem_tag((void *)addr);
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index 1b026793ad57..3d20d3451d9e 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -29,7 +29,7 @@
#include "kasan.h"
#include "../slab.h"
-const char *get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_access_info *info)
{
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
struct kasan_alloc_meta *alloc_meta;
@@ -72,7 +72,7 @@ const char *get_bug_type(struct kasan_access_info *info)
return "invalid-access";
}
-void *find_first_bad_addr(void *addr, size_t size)
+void *kasan_find_first_bad_addr(void *addr, size_t size)
{
u8 tag = get_tag(addr);
void *p = kasan_reset_tag(addr);
@@ -83,12 +83,12 @@ void *find_first_bad_addr(void *addr, size_t size)
return p;
}
-void metadata_fetch_row(char *buffer, void *row)
+void kasan_metadata_fetch_row(char *buffer, void *row)
{
memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
}
-void print_tags(u8 addr_tag, const void *addr)
+void kasan_print_tags(u8 addr_tag, const void *addr)
{
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 7c2c08c55f32..80adc85d0393 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -27,20 +27,20 @@
bool __kasan_check_read(const volatile void *p, unsigned int size)
{
- return check_memory_region((unsigned long)p, size, false, _RET_IP_);
+ return kasan_check_range((unsigned long)p, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_read);
bool __kasan_check_write(const volatile void *p, unsigned int size)
{
- return check_memory_region((unsigned long)p, size, true, _RET_IP_);
+ return kasan_check_range((unsigned long)p, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)
{
- if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
+ if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
return NULL;
return __memset(addr, c, len);
@@ -50,8 +50,8 @@ void *memset(void *addr, int c, size_t len)
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+ !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
return NULL;
return __memmove(dest, src, len);
@@ -61,8 +61,8 @@ void *memmove(void *dest, const void *src, size_t len)
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+ !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
return NULL;
return __memcpy(dest, src, len);
@@ -72,7 +72,7 @@ void *memcpy(void *dest, const void *src, size_t len)
* Poisons the shadow memory for 'size' bytes starting from 'addr'.
* Memory addresses should be aligned to KASAN_GRANULE_SIZE.
*/
-void poison_range(const void *address, size_t size, u8 value)
+void kasan_poison(const void *address, size_t size, u8 value)
{
void *shadow_start, *shadow_end;
@@ -89,8 +89,9 @@ void poison_range(const void *address, size_t size, u8 value)
__memset(shadow_start, value, shadow_end - shadow_start);
}
+EXPORT_SYMBOL(kasan_poison);
-void unpoison_range(const void *address, size_t size)
+void kasan_unpoison(const void *address, size_t size)
{
u8 tag = get_tag(address);
@@ -101,7 +102,7 @@ void unpoison_range(const void *address, size_t size)
*/
address = kasan_reset_tag(address);
- poison_range(address, size, tag);
+ kasan_poison(address, size, tag);
if (size & KASAN_GRANULE_MASK) {
u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
@@ -286,7 +287,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
* // vmalloc() allocates memory
* // let a = area->addr
* // we reach kasan_populate_vmalloc
- * // and call unpoison_range:
+ * // and call kasan_unpoison:
* STORE shadow(a), unpoison_val
* ...
* STORE shadow(a+99), unpoison_val x = LOAD p
@@ -321,7 +322,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size)
return;
size = round_up(size, KASAN_GRANULE_SIZE);
- poison_range(start, size, KASAN_VMALLOC_INVALID);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID);
}
void kasan_unpoison_vmalloc(const void *start, unsigned long size)
@@ -329,7 +330,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size)
if (!is_vmalloc_or_module_addr(start))
return;
- unpoison_range(start, size);
+ kasan_unpoison(start, size);
}
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 5dcd830805b2..94c2d33be333 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -57,7 +57,7 @@ void __init kasan_init_sw_tags(void)
* sequence has in fact positive effect, since interrupts that randomly skew
* PRNG at unpredictable points do only good.
*/
-u8 random_tag(void)
+u8 kasan_random_tag(void)
{
u32 state = this_cpu_read(prng_state);
@@ -67,7 +67,7 @@ u8 random_tag(void)
return (u8)(state % (KASAN_TAG_MAX + 1));
}
-bool check_memory_region(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(unsigned long addr, size_t size, bool write,
unsigned long ret_ip)
{
u8 tag;
@@ -118,24 +118,24 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
return true;
}
-bool check_invalid_free(void *addr)
+bool kasan_byte_accessible(const void *addr)
{
u8 tag = get_tag(addr);
u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
- return (shadow_byte == KASAN_TAG_INVALID) ||
- (tag != KASAN_TAG_KERNEL && tag != shadow_byte);
+ return (shadow_byte != KASAN_TAG_INVALID) &&
+ (tag == KASAN_TAG_KERNEL || tag == shadow_byte);
}
#define DEFINE_HWASAN_LOAD_STORE(size) \
void __hwasan_load##size##_noabort(unsigned long addr) \
{ \
- check_memory_region(addr, size, false, _RET_IP_); \
+ kasan_check_range(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
void __hwasan_store##size##_noabort(unsigned long addr) \
{ \
- check_memory_region(addr, size, true, _RET_IP_); \
+ kasan_check_range(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__hwasan_store##size##_noabort)
@@ -147,19 +147,19 @@ DEFINE_HWASAN_LOAD_STORE(16);
void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
{
- check_memory_region(addr, size, false, _RET_IP_);
+ kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_loadN_noabort);
void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
{
- check_memory_region(addr, size, true, _RET_IP_);
+ kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_storeN_noabort);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
{
- poison_range((void *)addr, size, tag);
+ kasan_poison((void *)addr, size, tag);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fb0fdaec34d5..75e246f680f4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1643,6 +1643,7 @@ static void collapse_file(struct mm_struct *mm,
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
+ int nr;
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -1854,11 +1855,12 @@ out_unlock:
put_page(page);
goto xa_unlocked;
}
+ nr = thp_nr_pages(new_page);
if (is_shmem)
- __inc_lruvec_page_state(new_page, NR_SHMEM_THPS);
+ __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
else {
- __inc_lruvec_page_state(new_page, NR_FILE_THPS);
+ __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
filemap_nr_thps_inc(mapping);
}
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fe230081690b..6f067b6b935f 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -373,21 +373,13 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
/*
* This is called when shrinker has already been unregistered,
- * and nobody can use it. So, there is no need to use kvfree_rcu_local().
+ * and nobody can use it. So, there is no need to use kvfree_rcu().
*/
memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
kvfree(memcg_lrus);
}
-static void kvfree_rcu_local(struct rcu_head *head)
-{
- struct list_lru_memcg *mlru;
-
- mlru = container_of(head, struct list_lru_memcg, rcu);
- kvfree(mlru);
-}
-
static int memcg_update_list_lru_node(struct list_lru_node *nlru,
int old_size, int new_size)
{
@@ -419,7 +411,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
- call_rcu(&old->rcu, kvfree_rcu_local);
+ kvfree_rcu(old, rcu);
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b9bd354e97e..845eec01ef9d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;
+static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned int nr_pages);
+static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
+ unsigned int nr_pages);
+
static void obj_cgroup_release(struct percpu_ref *ref)
{
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
@@ -447,8 +452,7 @@ static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
for_each_node(nid) {
pn = mem_cgroup_nodeinfo(memcg, nid);
map = rcu_dereference_protected(pn->shrinker_map, true);
- if (map)
- kvfree(map);
+ kvfree(map);
rcu_assign_pointer(pn->shrinker_map, NULL);
}
}
@@ -1043,29 +1047,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
-/**
- * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
- * @page: page from which memcg should be extracted.
- *
- * Obtain a reference on page->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned.
- */
-struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
-{
- struct mem_cgroup *memcg = page_memcg(page);
-
- if (mem_cgroup_disabled())
- return NULL;
-
- rcu_read_lock();
- /* Page should not get uncharged and freed memcg under us. */
- if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- rcu_read_unlock();
- return memcg;
-}
-EXPORT_SYMBOL(get_mem_cgroup_from_page);
-
static __always_inline struct mem_cgroup *active_memcg(void)
{
if (in_interrupt())
@@ -1080,13 +1061,9 @@ static __always_inline struct mem_cgroup *get_active_memcg(void)
rcu_read_lock();
memcg = active_memcg();
- if (memcg) {
- /* current->active_memcg must hold a ref. */
- if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- else
- memcg = current->active_memcg;
- }
+ /* remote memcg must hold a ref. */
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
+ memcg = root_mem_cgroup;
rcu_read_unlock();
return memcg;
@@ -1346,20 +1323,19 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
* lock_page_lruvec - lock and return lruvec for a given page.
* @page: the page
*
- * This series functions should be used in either conditions:
- * PageLRU is cleared or unset
- * or page->_refcount is zero
- * or page is locked.
+ * These functions are safe to use under any of the following conditions:
+ * - page locked
+ * - PageLRU cleared
+ * - lock_page_memcg()
+ * - page->_refcount is zero
*/
struct lruvec *lock_page_lruvec(struct page *page)
{
struct lruvec *lruvec;
struct pglist_data *pgdat = page_pgdat(page);
- rcu_read_lock();
lruvec = mem_cgroup_page_lruvec(page, pgdat);
spin_lock(&lruvec->lru_lock);
- rcu_read_unlock();
lruvec_memcg_debug(lruvec, page);
@@ -1371,10 +1347,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
struct lruvec *lruvec;
struct pglist_data *pgdat = page_pgdat(page);
- rcu_read_lock();
lruvec = mem_cgroup_page_lruvec(page, pgdat);
spin_lock_irq(&lruvec->lru_lock);
- rcu_read_unlock();
lruvec_memcg_debug(lruvec, page);
@@ -1386,10 +1360,8 @@ struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
struct lruvec *lruvec;
struct pglist_data *pgdat = page_pgdat(page);
- rcu_read_lock();
lruvec = mem_cgroup_page_lruvec(page, pgdat);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
- rcu_read_unlock();
lruvec_memcg_debug(lruvec, page);
@@ -1512,72 +1484,73 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
struct memory_stat {
const char *name;
- unsigned int ratio;
unsigned int idx;
};
-static struct memory_stat memory_stats[] = {
- { "anon", PAGE_SIZE, NR_ANON_MAPPED },
- { "file", PAGE_SIZE, NR_FILE_PAGES },
- { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
- { "pagetables", PAGE_SIZE, NR_PAGETABLE },
- { "percpu", 1, MEMCG_PERCPU_B },
- { "sock", PAGE_SIZE, MEMCG_SOCK },
- { "shmem", PAGE_SIZE, NR_SHMEM },
- { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
- { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
- { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+static const struct memory_stat memory_stats[] = {
+ { "anon", NR_ANON_MAPPED },
+ { "file", NR_FILE_PAGES },
+ { "kernel_stack", NR_KERNEL_STACK_KB },
+ { "pagetables", NR_PAGETABLE },
+ { "percpu", MEMCG_PERCPU_B },
+ { "sock", MEMCG_SOCK },
+ { "shmem", NR_SHMEM },
+ { "file_mapped", NR_FILE_MAPPED },
+ { "file_dirty", NR_FILE_DIRTY },
+ { "file_writeback", NR_WRITEBACK },
+#ifdef CONFIG_SWAP
+ { "swapcached", NR_SWAPCACHE },
+#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /*
- * The ratio will be initialized in memory_stats_init(). Because
- * on some architectures, the macro of HPAGE_PMD_SIZE is not
- * constant(e.g. powerpc).
- */
- { "anon_thp", 0, NR_ANON_THPS },
- { "file_thp", 0, NR_FILE_THPS },
- { "shmem_thp", 0, NR_SHMEM_THPS },
+ { "anon_thp", NR_ANON_THPS },
+ { "file_thp", NR_FILE_THPS },
+ { "shmem_thp", NR_SHMEM_THPS },
#endif
- { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
- { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
- { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
- { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
- { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
-
- /*
- * Note: The slab_reclaimable and slab_unreclaimable must be
- * together and slab_reclaimable must be in front.
- */
- { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
- { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+ { "inactive_anon", NR_INACTIVE_ANON },
+ { "active_anon", NR_ACTIVE_ANON },
+ { "inactive_file", NR_INACTIVE_FILE },
+ { "active_file", NR_ACTIVE_FILE },
+ { "unevictable", NR_UNEVICTABLE },
+ { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
/* The memory events */
- { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
- { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
- { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
- { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
- { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
- { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
- { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+ { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
};
-static int __init memory_stats_init(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memory_stats[i].idx == NR_ANON_THPS ||
- memory_stats[i].idx == NR_FILE_THPS ||
- memory_stats[i].idx == NR_SHMEM_THPS)
- memory_stats[i].ratio = HPAGE_PMD_SIZE;
-#endif
- VM_BUG_ON(!memory_stats[i].ratio);
- VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+/* Translate stat items to the correct unit for memory.stat output */
+static int memcg_page_state_unit(int item)
+{
+ switch (item) {
+ case MEMCG_PERCPU_B:
+ case NR_SLAB_RECLAIMABLE_B:
+ case NR_SLAB_UNRECLAIMABLE_B:
+ case WORKINGSET_REFAULT_ANON:
+ case WORKINGSET_REFAULT_FILE:
+ case WORKINGSET_ACTIVATE_ANON:
+ case WORKINGSET_ACTIVATE_FILE:
+ case WORKINGSET_RESTORE_ANON:
+ case WORKINGSET_RESTORE_FILE:
+ case WORKINGSET_NODERECLAIM:
+ return 1;
+ case NR_KERNEL_STACK_KB:
+ return SZ_1K;
+ default:
+ return PAGE_SIZE;
}
+}
- return 0;
+static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
+ int item)
+{
+ return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-pure_initcall(memory_stats_init);
static char *memory_stat_format(struct mem_cgroup *memcg)
{
@@ -1602,13 +1575,12 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
- size = memcg_page_state(memcg, memory_stats[i].idx);
- size *= memory_stats[i].ratio;
+ size = memcg_page_state_output(memcg, memory_stats[i].idx);
seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
- size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ size += memcg_page_state_output(memcg,
+ NR_SLAB_RECLAIMABLE_B);
seq_buf_printf(&s, "slab %llu\n", size);
}
}
@@ -2935,9 +2907,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp)
+ gfp_t gfp, bool new_page)
{
unsigned int objects = objs_per_slab_page(s, page);
+ unsigned long memcg_data;
void *vec;
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
@@ -2945,11 +2918,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
if (!vec)
return -ENOMEM;
- if (!set_page_objcgs(page, vec))
+ memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
+ if (new_page) {
+ /*
+ * If the slab page is brand new and nobody can yet access
+ * it's memcg_data, no synchronization is required and
+ * memcg_data can be simply assigned.
+ */
+ page->memcg_data = memcg_data;
+ } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
+ /*
+ * If the slab page is already in use, somebody can allocate
+ * and assign obj_cgroups in parallel. In this case the existing
+ * objcg vector should be reused.
+ */
kfree(vec);
- else
- kmemleak_not_leak(vec);
+ return 0;
+ }
+ kmemleak_not_leak(vec);
return 0;
}
@@ -3077,8 +3064,8 @@ static void memcg_free_cache_id(int id)
*
* Returns 0 on success, an error code on failure.
*/
-int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages)
+static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned int nr_pages)
{
struct page_counter *counter;
int ret;
@@ -3110,7 +3097,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
* @memcg: memcg to uncharge
* @nr_pages: number of pages to uncharge
*/
-void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
@@ -4072,10 +4059,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memcg1_stats[i] == NR_ANON_THPS)
- nr *= HPAGE_PMD_NR;
-#endif
seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
}
@@ -4106,10 +4089,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memcg1_stats[i] == NR_ANON_THPS)
- nr *= HPAGE_PMD_NR;
-#endif
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr * PAGE_SIZE);
}
@@ -5193,7 +5172,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
return 1;
}
- pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_cpu) {
free_percpu(pn->lruvec_stat_local);
@@ -5642,7 +5621,6 @@ static int mem_cgroup_move_account(struct page *page,
__mod_lruvec_state(to_vec, NR_ANON_THPS,
nr_pages);
}
-
}
} else {
__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
@@ -6393,6 +6371,12 @@ static int memory_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_NUMA
+static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
+ int item)
+{
+ return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
+}
+
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
int i;
@@ -6410,8 +6394,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- size = lruvec_page_state(lruvec, memory_stats[i].idx);
- size *= memory_stats[i].ratio;
+ size = lruvec_page_state_output(lruvec,
+ memory_stats[i].idx);
seq_printf(m, " N%d=%llu", nid, size);
}
seq_putc(m, '\n');
@@ -6760,7 +6744,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
memcg_check_events(memcg, page);
local_irq_enable();
- if (PageSwapCache(page)) {
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
+ if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
@@ -6851,31 +6847,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
css_put(&ug->memcg->css);
}
-static void uncharge_list(struct list_head *page_list)
-{
- struct uncharge_gather ug;
- struct list_head *next;
-
- uncharge_gather_clear(&ug);
-
- /*
- * Note that the list can be a single page->lru; hence the
- * do-while loop instead of a simple list_for_each_entry().
- */
- next = page_list->next;
- do {
- struct page *page;
-
- page = list_entry(next, struct page, lru);
- next = page->lru.next;
-
- uncharge_page(page, &ug);
- } while (next != page_list);
-
- if (ug.memcg)
- uncharge_batch(&ug);
-}
-
/**
* mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
@@ -6907,11 +6878,17 @@ void mem_cgroup_uncharge(struct page *page)
*/
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
+ struct uncharge_gather ug;
+ struct page *page;
+
if (mem_cgroup_disabled())
return;
- if (!list_empty(page_list))
- uncharge_list(page_list);
+ uncharge_gather_clear(&ug);
+ list_for_each_entry(page, page_list, lru)
+ uncharge_page(page, &ug);
+ if (ug.memcg)
+ uncharge_batch(&ug);
}
/**
@@ -7078,6 +7055,14 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
+ /*
+ * Currently s32 type (can refer to struct batched_lruvec_stat) is
+ * used for per-memcg-per-cpu caching of per-node statistics. In order
+ * to work fine, we should make sure that the overfill threshold can't
+ * exceed S32_MAX / PAGE_SIZE.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
+
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e9481632fcd1..55c671904aac 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -243,9 +243,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
pfn, t->comm, t->pid);
if (flags & MF_ACTION_REQUIRED) {
- WARN_ON_ONCE(t != current);
- ret = force_sig_mceerr(BUS_MCEERR_AR,
+ if (t == current)
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
(void __user *)tk->addr, addr_lsb);
+ else
+ /* Signal other processes sharing the page if they have PF_MCE_EARLY set. */
+ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
+ addr_lsb, t);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -440,26 +444,26 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
* Determine whether a given process is "early kill" process which expects
* to be signaled when some page under the process is hwpoisoned.
* Return task_struct of the dedicated thread (main thread unless explicitly
- * specified) if the process is "early kill," and otherwise returns NULL.
+ * specified) if the process is "early kill" and otherwise returns NULL.
*
- * Note that the above is true for Action Optional case, but not for Action
- * Required case where SIGBUS should sent only to the current thread.
+ * Note that the above is true for Action Optional case. For Action Required
+ * case, it's only meaningful to the current thread which need to be signaled
+ * with SIGBUS, this error is Action Optional for other non current
+ * processes sharing the same error page,if the process is "early kill", the
+ * task_struct of the dedicated thread will also be returned.
*/
static struct task_struct *task_early_kill(struct task_struct *tsk,
int force_early)
{
if (!tsk->mm)
return NULL;
- if (force_early) {
- /*
- * Comparing ->mm here because current task might represent
- * a subthread, while tsk always points to the main thread.
- */
- if (tsk->mm == current->mm)
- return current;
- else
- return NULL;
- }
+ /*
+ * Comparing ->mm here because current task might represent
+ * a subthread, while tsk always points to the main thread.
+ */
+ if (force_early && tsk->mm == current->mm)
+ return current;
+
return find_early_kill_thread(tsk);
}
diff --git a/mm/memory.c b/mm/memory.c
index 5da964079678..784249f3307b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2177,11 +2177,11 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
spinlock_t *ptl;
int err = 0;
- pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
arch_enter_lazy_mmu_mode();
@@ -2195,7 +2195,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -2394,18 +2394,18 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
int err = 0;
spinlock_t *ptl;
if (create) {
- pte = (mm == &init_mm) ?
+ mapped_pte = pte = (mm == &init_mm) ?
pte_alloc_kernel_track(pmd, addr, mask) :
pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
} else {
- pte = (mm == &init_mm) ?
+ mapped_pte = pte = (mm == &init_mm) ?
pte_offset_kernel(pmd, addr) :
pte_offset_map_lock(mm, pmd, addr, &ptl);
}
@@ -2428,7 +2428,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_leave_lazy_mmu_mode();
if (mm != &init_mm)
- pte_unmap_unlock(pte-1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -5177,17 +5177,19 @@ long copy_huge_page_from_user(struct page *dst_page,
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+ struct page *subpage = dst_page;
- for (i = 0; i < pages_per_huge_page; i++) {
+ for (i = 0; i < pages_per_huge_page;
+ i++, subpage = mem_map_next(subpage, dst_page, i)) {
if (allow_pagefault)
- page_kaddr = kmap(dst_page + i);
+ page_kaddr = kmap(subpage);
else
- page_kaddr = kmap_atomic(dst_page + i);
+ page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
(const void __user *)(src + i * PAGE_SIZE),
PAGE_SIZE);
if (allow_pagefault)
- kunmap(dst_page + i);
+ kunmap(subpage);
else
kunmap_atomic(page_kaddr);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f9d57b9be8c7..abe43c1ae920 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -713,7 +713,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
- memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+ memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
MEMINIT_HOTPLUG, altmap, migratetype);
set_zone_contiguous(zone);
@@ -1260,7 +1260,14 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
if (!PageHuge(page))
continue;
head = compound_head(page);
- if (page_huge_active(head))
+ /*
+ * This test is racy as we hold no reference or lock. The
+ * hugetlb page could have been free'ed and head is no longer
+ * a hugetlb page before the following check. In such unlikely
+ * cases false positives and negatives are possible. Calling
+ * code must deal with these scenarios.
+ */
+ if (HPageMigratable(head))
goto found;
skip = compound_nr(head) - (page - head);
pfn += skip - 1;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2c3a86502053..ab51132547b8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -677,7 +677,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long flags = qp->flags;
/* range check first */
- VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
if (!qp->first) {
qp->first = vma;
@@ -875,6 +875,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
+ if (flags & MPOL_F_NUMA_BALANCING) {
+ if (new && new->mode == MPOL_BIND) {
+ new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
+ } else {
+ ret = -EINVAL;
+ mpol_put(new);
+ goto out;
+ }
+ }
+
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
mpol_put(new);
@@ -2486,6 +2496,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_BIND:
+ /* Optimize placement among multiple nodes via NUMA balancing */
+ if (pol->flags & MPOL_F_MORON) {
+ if (node_isset(thisnid, pol->v.nodes))
+ break;
+ goto out;
+ }
/*
* allows binding to multiple nodes.
diff --git a/mm/mempool.c b/mm/mempool.c
index 624ed51b060f..79959fac27d7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element)
static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_slab_free_mempool(element, _RET_IP_);
+ kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 20ca887ea769..62b81d5257aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -331,7 +331,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!get_page_unless_zero(page))
goto out;
pte_unmap_unlock(ptep, ptl);
- put_and_wait_on_page_locked(page);
+ put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
return;
out:
pte_unmap_unlock(ptep, ptl);
@@ -365,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
if (!get_page_unless_zero(page))
goto unlock;
spin_unlock(ptl);
- put_and_wait_on_page_locked(page);
+ put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
return;
unlock:
spin_unlock(ptl);
@@ -500,6 +500,12 @@ int migrate_page_move_mapping(struct address_space *mapping,
__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
}
+#ifdef CONFIG_SWAP
+ if (PageSwapCache(page)) {
+ __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
+ }
+#endif
if (dirty && mapping_can_writeback(mapping)) {
__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
diff --git a/mm/mlock.c b/mm/mlock.c
index 55b3b3672977..73960bb3464d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -278,8 +278,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
*/
if (TestClearPageLRU(page)) {
lruvec = relock_page_lruvec_irq(page, lruvec);
- del_page_from_lru_list(page, lruvec,
- page_lru(page));
+ del_page_from_lru_list(page, lruvec);
continue;
} else
__munlock_isolation_failed(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index 90673febce6a..3f287599a7a3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -189,7 +189,6 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
struct list_head *uf);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
- unsigned long retval;
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
struct vm_area_struct *next;
@@ -281,9 +280,8 @@ success:
return brk;
out:
- retval = origbrk;
mmap_write_unlock(mm);
- return retval;
+ return origbrk;
}
static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ab709023e9aa..94188df1ee55 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -617,10 +617,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (tmp > end)
tmp = end;
- if (vma->vm_ops && vma->vm_ops->mprotect)
+ if (vma->vm_ops && vma->vm_ops->mprotect) {
error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
- if (error)
- goto out;
+ if (error)
+ goto out;
+ }
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
diff --git a/mm/mremap.c b/mm/mremap.c
index 47192691fe32..ec8f840399ed 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -593,6 +593,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
/* We always clear VM_LOCKED[ONFAULT] on the old vma */
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ /*
+ * anon_vma links of the old vma is no longer needed after its page
+ * table has been moved.
+ */
+ if (new_vma != vma && vma->vm_start == old_addr &&
+ vma->vm_end == (old_addr + old_len))
+ unlink_anon_vmas(vma);
+
/* Because we won't unmap we don't need to touch locked_vm */
return new_addr;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c9a33ffe38b7..9efaf430cfd3 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -395,9 +395,8 @@ static int dump_task(struct task_struct *p, void *arg)
task = find_lock_task_mm(p);
if (!task) {
/*
- * This is a kthread or all of p's threads have already
- * detached their mm's. There's no need to report
- * them; they can't be oom killed anyway.
+ * All of p's threads have already detached their mm's. There's
+ * no need to report them; they can't be oom killed anyway.
*/
return 0;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef5070fed76b..ddccc59f2f72 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5584,10 +5584,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_WRITEBACK)),
K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
- K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
- * HPAGE_PMD_NR),
- K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_SHMEM_THPS)),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
+ K(node_page_state(pgdat, NR_ANON_THPS)),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
node_page_state(pgdat, NR_KERNEL_STACK_KB),
@@ -6122,7 +6121,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
* zone stats (e.g., nr_isolate_pageblock) are touched.
*/
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, unsigned long zone_end_pfn,
enum meminit_context context,
struct vmem_altmap *altmap, int migratetype)
@@ -6259,23 +6258,21 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-void __meminit __weak memmap_init(unsigned long size, int nid,
- unsigned long zone,
- unsigned long range_start_pfn)
+void __meminit __weak memmap_init_zone(struct zone *zone)
{
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+ int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
unsigned long start_pfn, end_pfn;
- unsigned long range_end_pfn = range_start_pfn + size;
- int i;
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
- end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
- if (end_pfn > start_pfn) {
- size = end_pfn - start_pfn;
- memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn,
- MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
- }
+ if (end_pfn > start_pfn)
+ memmap_init_range(end_pfn - start_pfn, nid,
+ zone_id, start_pfn, zone_end_pfn,
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
}
}
@@ -6768,25 +6765,22 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
return usemapsize / 8;
}
-static void __ref setup_usemap(struct pglist_data *pgdat,
- struct zone *zone,
- unsigned long zone_start_pfn,
- unsigned long zonesize)
+static void __ref setup_usemap(struct zone *zone)
{
- unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
+ unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
+ zone->spanned_pages);
zone->pageblock_flags = NULL;
if (usemapsize) {
zone->pageblock_flags =
memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
- pgdat->node_id);
+ zone_to_nid(zone));
if (!zone->pageblock_flags)
panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
- usemapsize, zone->name, pgdat->node_id);
+ usemapsize, zone->name, zone_to_nid(zone));
}
}
#else
-static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
- unsigned long zone_start_pfn, unsigned long zonesize) {}
+static inline void setup_usemap(struct zone *zone) {}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -6933,7 +6927,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, freesize, memmap_pages;
- unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
freesize = zone->present_pages;
@@ -6981,9 +6974,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
continue;
set_pageblock_order();
- setup_usemap(pgdat, zone, zone_start_pfn, size);
- init_currently_empty_zone(zone, zone_start_pfn, size);
- memmap_init(size, nid, j, zone_start_pfn);
+ setup_usemap(zone);
+ init_currently_empty_zone(zone, zone->zone_start_pfn, size);
+ memmap_init_zone(zone);
}
}
@@ -7698,17 +7691,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-#ifdef CONFIG_HIGHMEM
-void free_highmem_page(struct page *page)
-{
- __free_reserved_page(page);
- totalram_pages_inc();
- atomic_long_inc(&page_zone(page)->managed_pages);
- totalhigh_pages_inc();
-}
-#endif
-
-
void __init mem_init_print_info(const char *str)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
diff --git a/mm/page_io.c b/mm/page_io.c
index 92f7941c6d01..485fa5cca4a2 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -41,9 +41,9 @@ void end_swap_bio_write(struct bio *bio)
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
- pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
@@ -106,9 +106,9 @@ static void end_swap_bio_read(struct bio *bio)
if (bio->bi_status) {
SetPageError(page);
ClearPageUptodate(page);
- pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index af464bb7fbe7..d15c7c4994f5 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -263,8 +263,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
struct page *page;
struct page_ext *page_ext;
struct page_owner *page_owner;
- unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
- unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long pfn, block_end_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
unsigned long count[MIGRATE_TYPES] = { 0, };
int pageblock_mt, page_mt;
int i;
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index cd8e13d41df4..c50d93ffa252 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -211,7 +211,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
}
/* Rotate any leftover pages to the head of the freelist */
- if (&next->lru != list && !list_is_first(&next->lru, list))
+ if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
list_rotate_to_front(&next->lru, list);
spin_unlock_irq(&zone->lock);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 9578db83e312..c2210e1cdb51 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -135,8 +135,9 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ VM_BUG_ON(!pmd_present(*pmdp));
+ /* Below assumes pmd_present() is true */
+ VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
diff --git a/mm/rmap.c b/mm/rmap.c
index 08c56aaf72eb..e26ae119a131 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -413,8 +413,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
- if (vma->anon_vma)
+ if (vma->anon_vma) {
vma->anon_vma->degree--;
+
+ /*
+ * vma would still be needed after unlink, and anon_vma will be prepared
+ * when handle fault.
+ */
+ vma->anon_vma = NULL;
+ }
unlock_anon_vma_root(root);
/*
@@ -1144,7 +1151,7 @@ void do_page_add_anon_rmap(struct page *page,
* disabled.
*/
if (compound)
- __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
@@ -1186,7 +1193,7 @@ void page_add_new_anon_rmap(struct page *page,
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
- __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1211,16 +1218,20 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ int nr_pages = thp_nr_pages(page);
+
+ for (i = 0, nr = 0; i < nr_pages; i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
if (PageSwapBacked(page))
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
+ nr_pages);
else
- __inc_node_page_state(page, NR_FILE_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+ nr_pages);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1252,16 +1263,20 @@ static void page_remove_file_rmap(struct page *page, bool compound)
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ int nr_pages = thp_nr_pages(page);
+
+ for (i = 0, nr = 0; i < nr_pages; i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
return;
if (PageSwapBacked(page))
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
+ -nr_pages);
else
- __dec_node_page_state(page, NR_FILE_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+ -nr_pages);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
return;
@@ -1292,7 +1307,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
if (TestClearPageDoubleMap(page)) {
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 7924b3bf46fb..ff741d229701 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -713,7 +713,7 @@ next:
}
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
- __inc_lruvec_page_state(page, NR_SHMEM_THPS);
+ __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
}
mapping->nrpages += nr;
__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
diff --git a/mm/slab.c b/mm/slab.c
index dcc55e78f353..35c68d99d460 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -272,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
#define STATS_INC_GROWN(x) ((x)->grown++)
-#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
+#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y))
#define STATS_SET_HIGH(x) \
do { \
if ((x)->num_active > (x)->high_mark) \
@@ -296,7 +296,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
+#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -332,7 +332,7 @@ static int obj_offset(struct kmem_cache *cachep)
static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
- return (unsigned long long*) (objp + obj_offset(cachep) -
+ return (unsigned long long *) (objp + obj_offset(cachep) -
sizeof(unsigned long long));
}
@@ -580,7 +580,7 @@ static int transfer_objects(struct array_cache *to,
if (!nr)
return 0;
- memcpy(to->entry + to->avail, from->entry + from->avail -nr,
+ memcpy(to->entry + to->avail, from->entry + from->avail - nr,
sizeof(void *) *nr);
from->avail -= nr;
@@ -1379,7 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- account_slab_page(page, cachep->gfporder, cachep);
+ account_slab_page(page, cachep->gfporder, cachep, flags);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
@@ -1790,8 +1790,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
}
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -2738,7 +2737,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#else
#define kfree_debugcheck(x) do { } while(0)
-#define cache_free_debugcheck(x,objp,z) (objp)
+#define cache_free_debugcheck(x, objp, z) (objp)
#endif
static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
@@ -3025,7 +3024,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
return objp;
}
#else
-#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
+#define cache_alloc_debugcheck_after(a, b, objp, d) (objp)
#endif
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
@@ -3421,7 +3420,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
memset(objp, 0, cachep->object_size);
/* Put the object into the quarantine, don't touch it for now. */
- if (kasan_slab_free(cachep, objp, _RET_IP_))
+ if (kasan_slab_free(cachep, objp))
return;
/* Use KCSAN to help debug racy use-after-free. */
@@ -3717,7 +3716,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
__cache_free(cachep, objp, _RET_IP_);
local_irq_restore(flags);
- trace_kmem_cache_free(_RET_IP_, objp);
+ trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
}
EXPORT_SYMBOL(kmem_cache_free);
diff --git a/mm/slab.h b/mm/slab.h
index ecad9b57bc44..076582f58f68 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -110,8 +110,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *));
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *));
+ slab_flags_t flags, const char *name);
#else
static inline struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
@@ -119,8 +118,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
{ return NULL; }
static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -240,7 +238,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp);
+ gfp_t gfp, bool new_page);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
@@ -317,7 +315,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
page = virt_to_head_page(p[i]);
if (!page_objcgs(page) &&
- memcg_alloc_page_obj_cgroups(page, s, flags)) {
+ memcg_alloc_page_obj_cgroups(page, s, flags,
+ false)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
@@ -381,7 +380,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
}
static inline int memcg_alloc_page_obj_cgroups(struct page *page,
- struct kmem_cache *s, gfp_t gfp)
+ struct kmem_cache *s, gfp_t gfp,
+ bool new_page)
{
return 0;
}
@@ -422,8 +422,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
}
static __always_inline void account_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+ struct kmem_cache *s,
+ gfp_t gfp)
{
+ if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
+ memcg_alloc_page_obj_cgroups(page, s, gfp, true);
+
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
PAGE_SIZE << order);
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index adbace4256ef..7c8298c17145 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -197,7 +197,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
- flags = kmem_cache_flags(size, flags, name, NULL);
+ flags = kmem_cache_flags(size, flags, name);
if (flags & SLAB_NEVER_MERGE)
return NULL;
@@ -309,9 +309,6 @@ kmem_cache_create_usercopy(const char *name,
const char *cache_name;
int err;
- get_online_cpus();
- get_online_mems();
-
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
@@ -360,9 +357,6 @@ kmem_cache_create_usercopy(const char *name,
out_unlock:
mutex_unlock(&slab_mutex);
- put_online_mems();
- put_online_cpus();
-
if (err) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
@@ -486,9 +480,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (unlikely(!s))
return;
- get_online_cpus();
- get_online_mems();
-
mutex_lock(&slab_mutex);
s->refcount--;
@@ -503,9 +494,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
}
out_unlock:
mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -522,12 +510,10 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret;
- get_online_cpus();
- get_online_mems();
+
kasan_cache_shrink(cachep);
ret = __kmem_cache_shrink(cachep);
- put_online_mems();
- put_online_cpus();
+
return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -912,8 +898,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
page = alloc_pages(flags, order);
if (likely(page)) {
ret = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
@@ -1232,19 +1218,21 @@ size_t ksize(const void *objp)
size_t size;
/*
- * We need to check that the pointed to object is valid, and only then
- * unpoison the shadow memory below. We use __kasan_check_read(), to
- * generate a more useful report at the time ksize() is called (rather
- * than later where behaviour is undefined due to potential
- * use-after-free or double-free).
+ * We need to first check that the pointer to the object is valid, and
+ * only then unpoison the memory. The report printed from ksize() is
+ * more useful, then when it's printed later when the behaviour could
+ * be undefined due to a potential use-after-free or double-free.
+ *
+ * We use kasan_check_byte(), which is supported for the hardware
+ * tag-based KASAN mode, unlike kasan_check_read/write().
*
- * If the pointed to memory is invalid we return 0, to avoid users of
+ * If the pointed to memory is invalid, we return 0 to avoid users of
* ksize() writing to and potentially corrupting the memory region.
*
* We want to perform the check before __ksize(), to avoid potentially
* crashing in __ksize() due to accessing invalid metadata.
*/
- if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
return 0;
size = __ksize(objp);
diff --git a/mm/slob.c b/mm/slob.c
index ef87ada8705d..0578429b991b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -673,7 +673,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
__kmem_cache_free(b, c->size);
}
- trace_kmem_cache_free(_RET_IP_, b);
+ trace_kmem_cache_free(_RET_IP_, b, c->name);
}
EXPORT_SYMBOL(kmem_cache_free);
diff --git a/mm/slub.c b/mm/slub.c
index f5baf429654f..b2833ce85c92 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -235,6 +235,14 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
#endif
}
+/*
+ * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
+ * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
+ * differ during memory hotplug/hotremove operations.
+ * Protected by slab_mutex.
+ */
+static nodemask_t slab_nodes;
+
/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -1400,7 +1408,6 @@ __setup("slub_debug", setup_slub_debug);
* @object_size: the size of an object without meta data
* @flags: flags to set
* @name: name of the cache
- * @ctor: constructor function
*
* Debug option(s) are applied to @flags. In addition to the debug
* option(s), if a slab name (or multiple) is specified i.e.
@@ -1408,13 +1415,21 @@ __setup("slub_debug", setup_slub_debug);
* then only the select slabs will receive the debug option(s).
*/
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
char *iter;
size_t len;
char *next_block;
slab_flags_t block_flags;
+ slab_flags_t slub_debug_local = slub_debug;
+
+ /*
+ * If the slab cache is for debugging (e.g. kmemleak) then
+ * don't store user (stack trace) information by default,
+ * but let the user enable it via the command line below.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ slub_debug_local &= ~SLAB_STORE_USER;
len = strlen(name);
next_block = slub_debug_string;
@@ -1449,7 +1464,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
}
}
- return flags | slub_debug;
+ return flags | slub_debug_local;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
@@ -1474,8 +1489,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -1514,7 +1528,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
static __always_inline void kfree_hook(void *x)
{
kmemleak_free(x);
- kasan_kfree_large(x, _RET_IP_);
+ kasan_kfree_large(x);
}
static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
@@ -1544,7 +1558,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
/* KASAN might put x into memory quarantine, delaying its reuse */
- return kasan_slab_free(s, x, _RET_IP_);
+ return kasan_slab_free(s, x);
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
@@ -1771,7 +1785,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->objects = oo_objects(oo);
- account_slab_page(page, oo_order(oo), s);
+ account_slab_page(page, oo_order(oo), s, flags);
page->slab_cache = s;
__SetPageSlab(page);
@@ -2153,9 +2167,9 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- int lock = 0;
+ int lock = 0, free_delta = 0;
enum slab_modes l = M_NONE, m = M_NONE;
- void *nextfree;
+ void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
struct page new;
struct page old;
@@ -2166,45 +2180,34 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
}
/*
- * Stage one: Free all available per cpu objects back
- * to the page freelist while it is still frozen. Leave the
- * last one.
- *
- * There is no need to take the list->lock because the page
- * is still frozen.
+ * Stage one: Count the objects on cpu's freelist as free_delta and
+ * remember the last object in freelist_tail for later splicing.
*/
- while (freelist && (nextfree = get_freepointer(s, freelist))) {
- void *prior;
- unsigned long counters;
+ freelist_tail = NULL;
+ freelist_iter = freelist;
+ while (freelist_iter) {
+ nextfree = get_freepointer(s, freelist_iter);
/*
* If 'nextfree' is invalid, it is possible that the object at
- * 'freelist' is already corrupted. So isolate all objects
- * starting at 'freelist'.
+ * 'freelist_iter' is already corrupted. So isolate all objects
+ * starting at 'freelist_iter' by skipping them.
*/
- if (freelist_corrupted(s, page, &freelist, nextfree))
+ if (freelist_corrupted(s, page, &freelist_iter, nextfree))
break;
- do {
- prior = page->freelist;
- counters = page->counters;
- set_freepointer(s, freelist, prior);
- new.counters = counters;
- new.inuse--;
- VM_BUG_ON(!new.frozen);
-
- } while (!__cmpxchg_double_slab(s, page,
- prior, counters,
- freelist, new.counters,
- "drain percpu freelist"));
+ freelist_tail = freelist_iter;
+ free_delta++;
- freelist = nextfree;
+ freelist_iter = nextfree;
}
/*
- * Stage two: Ensure that the page is unfrozen while the
- * list presence reflects the actual number of objects
- * during unfreeze.
+ * Stage two: Unfreeze the page while splicing the per-cpu
+ * freelist to the head of page's freelist.
+ *
+ * Ensure that the page is unfrozen while the list presence
+ * reflects the actual number of objects during unfreeze.
*
* We setup the list membership and then perform a cmpxchg
* with the count. If there is a mismatch then the page
@@ -2217,15 +2220,15 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
*/
redo:
- old.freelist = page->freelist;
- old.counters = page->counters;
+ old.freelist = READ_ONCE(page->freelist);
+ old.counters = READ_ONCE(page->counters);
VM_BUG_ON(!old.frozen);
/* Determine target state of the slab */
new.counters = old.counters;
- if (freelist) {
- new.inuse--;
- set_freepointer(s, freelist, old.freelist);
+ if (freelist_tail) {
+ new.inuse -= free_delta;
+ set_freepointer(s, freelist_tail, old.freelist);
new.freelist = freelist;
} else
new.freelist = old.freelist;
@@ -2672,7 +2675,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
* ignore the node constraint
*/
if (unlikely(node != NUMA_NO_NODE &&
- !node_state(node, N_NORMAL_MEMORY)))
+ !node_isset(node, slab_nodes)))
node = NUMA_NO_NODE;
goto new_slab;
}
@@ -2683,7 +2686,7 @@ redo:
* same as above but node_match() being false already
* implies node != NUMA_NO_NODE
*/
- if (!node_state(node, N_NORMAL_MEMORY)) {
+ if (!node_isset(node, slab_nodes)) {
node = NUMA_NO_NODE;
goto redo;
} else {
@@ -3157,7 +3160,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
if (!s)
return;
slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
- trace_kmem_cache_free(_RET_IP_, x);
+ trace_kmem_cache_free(_RET_IP_, x, s->name);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3266,7 +3269,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (!df.page)
continue;
- slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
+ slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -3586,7 +3589,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
- for_each_node_state(node, N_NORMAL_MEMORY) {
+ for_each_node_mask(node, slab_nodes) {
struct kmem_cache_node *n;
if (slab_state == DOWN) {
@@ -3797,7 +3800,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
- s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
+ s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
#endif
@@ -4039,8 +4042,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
page = alloc_pages_node(node, flags, order);
if (page) {
ptr = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
return kmalloc_large_node_hook(ptr, size, flags);
@@ -4171,8 +4174,8 @@ void kfree(const void *x)
BUG_ON(!PageCompound(page));
kfree_hook(object);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(page, order);
return;
}
@@ -4267,8 +4270,6 @@ static int slab_mem_going_offline_callback(void *arg)
static void slab_mem_offline_callback(void *arg)
{
- struct kmem_cache_node *n;
- struct kmem_cache *s;
struct memory_notify *marg = arg;
int offline_node;
@@ -4282,21 +4283,12 @@ static void slab_mem_offline_callback(void *arg)
return;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- n = get_node(s, offline_node);
- if (n) {
- /*
- * if n->nr_slabs > 0, slabs still exist on the node
- * that is going down. We were unable to free them,
- * and offline_pages() function shouldn't call this
- * callback. So, we must fail.
- */
- BUG_ON(slabs_node(s, offline_node));
-
- s->node[offline_node] = NULL;
- kmem_cache_free(kmem_cache_node, n);
- }
- }
+ node_clear(offline_node, slab_nodes);
+ /*
+ * We no longer free kmem_cache_node structures here, as it would be
+ * racy with all get_node() users, and infeasible to protect them with
+ * slab_mutex.
+ */
mutex_unlock(&slab_mutex);
}
@@ -4323,6 +4315,12 @@ static int slab_mem_going_online_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
/*
+ * The structure may already exist if the node was previously
+ * onlined and offlined.
+ */
+ if (get_node(s, nid))
+ continue;
+ /*
* XXX: kmem_cache_alloc_node will fallback to other nodes
* since memory is not yet available from the node that
* is brought up.
@@ -4335,6 +4333,11 @@ static int slab_mem_going_online_callback(void *arg)
init_kmem_cache_node(n);
s->node[nid] = n;
}
+ /*
+ * Any cache created after this point will also have kmem_cache_node
+ * initialized for the new node.
+ */
+ node_set(nid, slab_nodes);
out:
mutex_unlock(&slab_mutex);
return ret;
@@ -4415,6 +4418,7 @@ void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
+ int node;
if (debug_guardpage_minorder())
slub_max_order = 0;
@@ -4422,6 +4426,13 @@ void __init kmem_cache_init(void)
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
+ /*
+ * Initialize the nodemask for which we will allocate per node
+ * structures. Here we don't need taking slab_mutex yet.
+ */
+ for_each_node_state(node, N_NORMAL_MEMORY)
+ node_set(node, slab_nodes);
+
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
@@ -4932,22 +4943,6 @@ enum slab_stat_type {
#define SO_OBJECTS (1 << SL_OBJECTS)
#define SO_TOTAL (1 << SL_TOTAL)
-#ifdef CONFIG_MEMCG
-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
-
-static int __init setup_slub_memcg_sysfs(char *str)
-{
- int v;
-
- if (get_option(&str, &v) > 0)
- memcg_sysfs_enabled = v;
-
- return 1;
-}
-
-__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
-#endif
-
static ssize_t show_slab_objects(struct kmem_cache *s,
char *buf, unsigned long flags)
{
diff --git a/mm/swap.c b/mm/swap.c
index 2cca7141470c..ab3258afcbeb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -83,9 +83,8 @@ static void __page_cache_release(struct page *page)
unsigned long flags;
lruvec = lock_page_lruvec_irqsave(page, &flags);
- VM_BUG_ON_PAGE(!PageLRU(page), page);
- __ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_off_lru(page));
+ del_page_from_lru_list(page, lruvec);
+ __clear_page_lru_flags(page);
unlock_page_lruvec_irqrestore(lruvec, flags);
}
__ClearPageWaiters(page);
@@ -229,9 +228,9 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
{
if (!PageUnevictable(page)) {
- del_page_from_lru_list(page, lruvec, page_lru(page));
+ del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
- add_page_to_lru_list_tail(page, lruvec, page_lru(page));
+ add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, thp_nr_pages(page));
}
}
@@ -308,13 +307,11 @@ void lru_note_cost_page(struct page *page)
static void __activate_page(struct page *page, struct lruvec *lruvec)
{
if (!PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
int nr_pages = thp_nr_pages(page);
- del_page_from_lru_list(page, lruvec, lru);
+ del_page_from_lru_list(page, lruvec);
SetPageActive(page);
- lru += LRU_ACTIVE;
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, lruvec);
trace_mm_lru_activate(page);
__count_vm_events(PGACTIVATE, nr_pages);
@@ -519,8 +516,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
*/
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
{
- int lru;
- bool active;
+ bool active = PageActive(page);
int nr_pages = thp_nr_pages(page);
if (PageUnevictable(page))
@@ -530,10 +526,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
if (page_mapped(page))
return;
- active = PageActive(page);
- lru = page_lru_base_type(page);
-
- del_page_from_lru_list(page, lruvec, lru + active);
+ del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
@@ -543,14 +536,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
* It can make readahead confusing. But race window
* is _really_ small and it's non-critical problem.
*/
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, lruvec);
SetPageReclaim(page);
} else {
/*
* The page's writeback ends up during pagevec
* We moves tha page into tail of inactive.
*/
- add_page_to_lru_list_tail(page, lruvec, lru);
+ add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, nr_pages);
}
@@ -564,13 +557,12 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
if (PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
int nr_pages = thp_nr_pages(page);
- del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, lruvec);
__count_vm_events(PGDEACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
@@ -582,11 +574,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
{
if (PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- bool active = PageActive(page);
int nr_pages = thp_nr_pages(page);
- del_page_from_lru_list(page, lruvec,
- LRU_INACTIVE_ANON + active);
+ del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
/*
@@ -595,7 +585,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
* anonymous pages
*/
ClearPageSwapBacked(page);
- add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
+ add_page_to_lru_list(page, lruvec);
__count_vm_events(PGLAZYFREE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
@@ -918,9 +908,8 @@ void release_pages(struct page **pages, int nr)
if (prev_lruvec != lruvec)
lock_batch = 0;
- VM_BUG_ON_PAGE(!PageLRU(page), page);
- __ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_off_lru(page));
+ del_page_from_lru_list(page, lruvec);
+ __clear_page_lru_flags(page);
}
__ClearPageWaiters(page);
@@ -958,7 +947,6 @@ EXPORT_SYMBOL(__pagevec_release);
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
{
- enum lru_list lru;
int was_unevictable = TestClearPageUnevictable(page);
int nr_pages = thp_nr_pages(page);
@@ -994,19 +982,17 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
smp_mb__after_atomic();
if (page_evictable(page)) {
- lru = page_lru(page);
if (was_unevictable)
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
- lru = LRU_UNEVICTABLE;
ClearPageActive(page);
SetPageUnevictable(page);
if (!was_unevictable)
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
- add_page_to_lru_list(page, lruvec, lru);
- trace_mm_lru_insertion(page, lru);
+ add_page_to_lru_list(page, lruvec);
+ trace_mm_lru_insertion(page);
}
/*
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0357fbe70645..be9de6d5b516 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -193,8 +193,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
cache->slots_ret = NULL;
}
spin_unlock_irq(&cache->free_lock);
- if (slots)
- kvfree(slots);
+ kvfree(slots);
}
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 751c1ef2fe0e..c1a648d9092b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -68,32 +68,6 @@ static struct {
unsigned long find_total;
} swap_cache_info;
-unsigned long total_swapcache_pages(void)
-{
- unsigned int i, j, nr;
- unsigned long ret = 0;
- struct address_space *spaces;
- struct swap_info_struct *si;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- swp_entry_t entry = swp_entry(i, 1);
-
- /* Avoid get_swap_device() to warn for bad swap entry */
- if (!swp_swap_info(entry))
- continue;
- /* Prevent swapoff to free swapper_spaces */
- si = get_swap_device(entry);
- if (!si)
- continue;
- nr = nr_swapper_spaces[i];
- spaces = swapper_spaces[i];
- for (j = 0; j < nr; j++)
- ret += spaces[j].nrpages;
- put_swap_device(si);
- }
- return ret;
-}
-
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
void show_swap_cache_info(void)
@@ -163,6 +137,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
address_space->nrexceptional -= nr_shadows;
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+ __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
ADD_CACHE_INFO(add_total, nr);
unlock:
xas_unlock_irq(&xas);
@@ -203,6 +178,7 @@ void __delete_from_swap_cache(struct page *page,
address_space->nrexceptional += nr;
address_space->nrpages -= nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
ADD_CACHE_INFO(del_total, nr);
}
@@ -537,7 +513,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
workingset_refault(page, shadow);
/* Caller will initiate read into locked page */
- SetPageWorkingset(page);
lru_cache_add(page);
*new_page_allocated = true;
return page;
@@ -927,7 +902,7 @@ static struct attribute *swap_attrs[] = {
NULL,
};
-static struct attribute_group swap_attr_group = {
+static const struct attribute_group swap_attr_group = {
.attrs = swap_attrs,
};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 96799a2f6957..f039745989d2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1157,13 +1157,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
return p;
bad_offset:
- pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
goto out;
bad_device:
- pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
goto out;
bad_nofile:
- pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
return NULL;
}
@@ -1180,7 +1180,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
return p;
bad_free:
- pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
out:
return NULL;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1b574ad199d..562e87cbd7a1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -310,7 +310,8 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
* @lru: lru to use
* @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
*/
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
+static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int zone_idx)
{
unsigned long size = 0;
int zid;
@@ -1539,19 +1540,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
* page: page to consider
* mode: one of the LRU isolation modes defined above
*
- * returns 0 on success, -ve errno on failure.
+ * returns true on success, false on failure.
*/
-int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
+bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
{
- int ret = -EBUSY;
-
/* Only take pages on the LRU. */
if (!PageLRU(page))
- return ret;
+ return false;
/* Compaction should not handle unevictable pages but CMA can do so */
if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
- return ret;
+ return false;
/*
* To minimise LRU disruption, the caller can indicate that it only
@@ -1564,7 +1563,7 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
if (mode & ISOLATE_ASYNC_MIGRATE) {
/* All the caller can do on PageWriteback is block */
if (PageWriteback(page))
- return ret;
+ return false;
if (PageDirty(page)) {
struct address_space *mapping;
@@ -1580,20 +1579,20 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
* from the page cache.
*/
if (!trylock_page(page))
- return ret;
+ return false;
mapping = page_mapping(page);
migrate_dirty = !mapping || mapping->a_ops->migratepage;
unlock_page(page);
if (!migrate_dirty)
- return ret;
+ return false;
}
}
if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
- return ret;
+ return false;
- return 0;
+ return true;
}
/*
@@ -1677,35 +1676,31 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* only when the page is being freed somewhere else.
*/
scan += nr_pages;
- switch (__isolate_lru_page_prepare(page, mode)) {
- case 0:
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- if (unlikely(!get_page_unless_zero(page)))
- goto busy;
-
- if (!TestClearPageLRU(page)) {
- /*
- * This page may in other isolation path,
- * but we still hold lru_lock.
- */
- put_page(page);
- goto busy;
- }
-
- nr_taken += nr_pages;
- nr_zone_taken[page_zonenum(page)] += nr_pages;
- list_move(&page->lru, dst);
- break;
+ if (!__isolate_lru_page_prepare(page, mode)) {
+ /* It is being freed elsewhere */
+ list_move(&page->lru, src);
+ continue;
+ }
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ if (unlikely(!get_page_unless_zero(page))) {
+ list_move(&page->lru, src);
+ continue;
+ }
- default:
-busy:
- /* else it is being freed elsewhere */
+ if (!TestClearPageLRU(page)) {
+ /* Another thread is already isolating this page */
+ put_page(page);
list_move(&page->lru, src);
+ continue;
}
+
+ nr_taken += nr_pages;
+ nr_zone_taken[page_zonenum(page)] += nr_pages;
+ list_move(&page->lru, dst);
}
/*
@@ -1772,7 +1767,7 @@ int isolate_lru_page(struct page *page)
get_page(page);
lruvec = lock_page_lruvec_irq(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
+ del_page_from_lru_list(page, lruvec);
unlock_page_lruvec_irq(lruvec);
ret = 0;
}
@@ -1829,7 +1824,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
struct page *page;
- enum lru_list lru;
while (!list_empty(list)) {
page = lru_to_page(list);
@@ -1856,8 +1850,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
SetPageLRU(page);
if (unlikely(put_page_testzero(page))) {
- __ClearPageLRU(page);
- __ClearPageActive(page);
+ __clear_page_lru_flags(page);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&lruvec->lru_lock);
@@ -1874,11 +1867,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
* inhibits memcg migration).
*/
VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
- lru = page_lru(page);
+ add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
-
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_add(&page->lru, &lruvec->lists[lru]);
nr_moved += nr_pages;
if (PageActive(page))
workingset_age_nonresident(lruvec, nr_pages);
@@ -4095,8 +4085,13 @@ module_init(kswapd_init)
*/
int node_reclaim_mode __read_mostly;
-#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */
+/*
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
+ * ABI. New bits are OK, but existing bits can never change.
+ */
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
@@ -4292,12 +4287,9 @@ void check_move_unevictable_pages(struct pagevec *pvec)
lruvec = relock_page_lruvec_irq(page, lruvec);
if (page_evictable(page) && PageUnevictable(page)) {
- enum lru_list lru = page_lru_base_type(page);
-
- VM_BUG_ON_PAGE(PageActive(page), page);
+ del_page_from_lru_list(page, lruvec);
ClearPageUnevictable(page);
- del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, lruvec);
pgrescued += nr_pages;
}
SetPageLRU(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..a0e949542204 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1215,6 +1215,9 @@ const char * const vmstat_text[] = {
"nr_shadow_call_stack",
#endif
"nr_page_table_pages",
+#ifdef CONFIG_SWAP
+ "nr_swapcached",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1619,8 +1622,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
if (is_zone_first_populated(pgdat, zone)) {
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ unsigned long pages = node_page_state_pages(pgdat, i);
+
+ if (vmstat_item_print_in_thp(i))
+ pages /= HPAGE_PMD_NR;
seq_printf(m, "\n %-12s %lu", node_stat_name(i),
- node_page_state_pages(pgdat, i));
+ pages);
}
}
seq_printf(m,
@@ -1740,8 +1747,11 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
v += NR_VM_NUMA_STAT_ITEMS;
#endif
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
v[i] = global_node_page_state_pages(i);
+ if (vmstat_item_print_in_thp(i))
+ v[i] /= HPAGE_PMD_NR;
+ }
v += NR_VM_NODE_STAT_ITEMS;
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
diff --git a/mm/workingset.c b/mm/workingset.c
index 10e96de945b3..cd39902c1062 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -263,10 +263,10 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(!PageLocked(page), page);
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
@@ -461,6 +461,8 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
+ if (!nodes)
+ return SHRINK_EMPTY;
/*
* Approximate a reasonable limit for the nodes
@@ -503,9 +505,6 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
- if (!nodes)
- return SHRINK_EMPTY;
-
if (nodes <= max_nodes)
return 0;
return nodes - max_nodes;
diff --git a/mm/z3fold.c b/mm/z3fold.c
index dacb0d70fa61..c1ccf6bb0ffb 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -413,16 +413,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
if (!slots)
return NULL;
+ memset(zhdr, 0, sizeof(*zhdr));
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
- zhdr->first_chunks = 0;
- zhdr->middle_chunks = 0;
- zhdr->last_chunks = 0;
- zhdr->first_num = 0;
- zhdr->start_middle = 0;
zhdr->cpu = -1;
- zhdr->foreign_handles = 0;
- zhdr->mapped_count = 0;
zhdr->slots = slots;
zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
@@ -541,8 +535,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
spin_unlock(&pool->stale_lock);
}
-static void __attribute__((__unused__))
- release_z3fold_page(struct kref *ref)
+static void release_z3fold_page(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 953f4a2de1e5..2e3ba91a5072 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -103,6 +103,7 @@ alloated||allocated
allocatote||allocate
allocatrd||allocated
allocte||allocate
+allocted||allocated
allpication||application
alocate||allocate
alogirhtms||algorithms
@@ -339,6 +340,7 @@ comppatible||compatible
compres||compress
compresion||compression
comression||compression
+comunicate||communicate
comunication||communication
conbination||combination
conditionaly||conditionally
@@ -466,6 +468,7 @@ developpment||development
deveolpment||development
devided||divided
deviece||device
+devision||division
diable||disable
dicline||decline
dictionnary||dictionary
@@ -479,6 +482,7 @@ difinition||definition
digial||digital
dimention||dimension
dimesions||dimensions
+diconnected||disconnected
disgest||digest
dispalying||displaying
diplay||display
@@ -518,6 +522,7 @@ downlads||downloads
droped||dropped
droput||dropout
druing||during
+dyanmic||dynamic
dynmaic||dynamic
eanable||enable
eanble||enable
@@ -542,6 +547,7 @@ encrupted||encrypted
encrypiton||encryption
encryptio||encryption
endianess||endianness
+enpoint||endpoint
enhaced||enhanced
enlightnment||enlightenment
enqueing||enqueuing
@@ -566,6 +572,7 @@ estbalishment||establishment
etsablishment||establishment
etsbalishment||establishment
evalution||evaluation
+exeeds||exceeds
excecutable||executable
exceded||exceeded
exceds||exceeds
@@ -574,6 +581,7 @@ excellant||excellent
execeeded||exceeded
execeeds||exceeds
exeed||exceed
+exeeds||exceeds
exeuction||execution
existance||existence
existant||existent
@@ -641,6 +649,7 @@ forwardig||forwarding
frambuffer||framebuffer
framming||framing
framwork||framework
+frequence||frequency
frequncy||frequency
frequancy||frequency
frome||from
@@ -683,10 +692,12 @@ handfull||handful
hanlde||handle
hanled||handled
happend||happened
+hardare||hardware
harware||hardware
havind||having
heirarchically||hierarchically
helpfull||helpful
+heterogenous||heterogeneous
hexdecimal||hexadecimal
hybernate||hibernate
hierachy||hierarchy
@@ -731,6 +742,7 @@ inconsistant||inconsistent
increas||increase
incremeted||incremented
incrment||increment
+incuding||including
inculde||include
indendation||indentation
indended||intended
@@ -741,6 +753,7 @@ indiate||indicate
indicat||indicate
inexpect||inexpected
inferface||interface
+infinit||infinite
infomation||information
informatiom||information
informations||information
@@ -771,6 +784,7 @@ instace||instance
instal||install
instanciate||instantiate
instanciated||instantiated
+instuments||instruments
insufficent||insufficient
inteface||interface
integreated||integrated
@@ -869,12 +883,14 @@ mailformed||malformed
malplaced||misplaced
malplace||misplace
managable||manageable
+managament||management
managment||management
mangement||management
manger||manager
manoeuvering||maneuvering
manufaucturing||manufacturing
mappping||mapping
+maping||mapping
matchs||matches
mathimatical||mathematical
mathimatic||mathematic
@@ -886,6 +902,7 @@ meetign||meeting
memeory||memory
memmber||member
memoery||memory
+memroy||memory
ment||meant
mergable||mergeable
mesage||message
@@ -999,6 +1016,7 @@ overlaping||overlapping
overide||override
overrided||overridden
overriden||overridden
+overrrun||overrun
overun||overrun
overwritting||overwriting
overwriten||overwritten
@@ -1035,6 +1053,7 @@ peforming||performing
peice||piece
pendantic||pedantic
peprocessor||preprocessor
+perfomance||performance
perfoming||performing
perfomring||performing
periperal||peripheral
@@ -1100,6 +1119,7 @@ prodecure||procedure
progamming||programming
progams||programs
progess||progress
+programable||programmable
programers||programmers
programm||program
programms||programs
@@ -1144,6 +1164,7 @@ recieved||received
recieve||receive
reciever||receiver
recieves||receives
+recieving||receiving
recogniced||recognised
recognizeable||recognizable
recommanded||recommended
@@ -1247,6 +1268,7 @@ searchs||searches
secquence||sequence
secund||second
segement||segment
+seleted||selected
semaphone||semaphore
senario||scenario
senarios||scenarios
@@ -1263,6 +1285,7 @@ seqeunce||sequence
seqeuncer||sequencer
seqeuencer||sequencer
sequece||sequence
+sequemce||sequence
sequencial||sequential
serivce||service
serveral||several
@@ -1333,6 +1356,7 @@ suble||subtle
substract||subtract
submited||submitted
submition||submission
+succeded||succeeded
suceed||succeed
succesfully||successfully
succesful||successful
@@ -1353,6 +1377,7 @@ supportin||supporting
suppoted||supported
suppported||supported
suppport||support
+supprot||support
supress||suppress
surpressed||suppressed
surpresses||suppresses
@@ -1401,6 +1426,7 @@ thresold||threshold
throught||through
trackling||tracking
troughput||throughput
+trys||tries
thses||these
tiggers||triggers
tiggered||triggered
@@ -1414,7 +1440,9 @@ traking||tracking
tramsmitted||transmitted
tramsmit||transmit
tranasction||transaction
+tranceiver||transceiver
tranfer||transfer
+tranmission||transmission
transcevier||transceiver
transciever||transceiver
transferd||transferred
@@ -1468,6 +1496,7 @@ unnecesary||unnecessary
unneedingly||unnecessarily
unnsupported||unsupported
unmached||unmatched
+unprecise||imprecise
unregester||unregister
unresgister||unregister
unrgesiter||unregister
@@ -1503,6 +1532,7 @@ varient||variant
vaule||value
verbse||verbose
veify||verify
+veriosn||version
verisons||versions
verison||version
verson||version
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 080a3d6cbd75..80e7b822a432 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -666,7 +666,7 @@ static void add_ignores(struct objtool_file *file)
static const char *uaccess_safe_builtin[] = {
/* KASAN */
"kasan_report",
- "check_memory_region",
+ "kasan_check_range",
/* KASAN out-of-line */
"__asan_loadN_noabort",
"__asan_load1_noabort",