summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-03-19 05:26:54 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2016-03-19 05:26:54 +0300
commit814a2bf957739f367cbebfa1b60237387b72d0ee (patch)
tree8d65c38d14beb8d6d2dc5b9d7f8dbe63c7cad31a
parent237045fc3c67d44088f767dca5a9fa30815eba62 (diff)
parentf9310b2f9a19b7f16c7b1c1558f8b649b9b933c1 (diff)
downloadlinux-814a2bf957739f367cbebfa1b60237387b72d0ee.tar.xz
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton: - a couple of hotfixes - the rest of MM - a new timer slack control in procfs - a couple of procfs fixes - a few misc things - some printk tweaks - lib/ updates, notably to radix-tree. - add my and Nick Piggin's old userspace radix-tree test harness to tools/testing/radix-tree/. Matthew said it was a godsend during the radix-tree work he did. - a few code-size improvements, switching to __always_inline where gcc screwed up. - partially implement character sets in sscanf * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits) sscanf: implement basic character sets lib/bug.c: use common WARN helper param: convert some "on"/"off" users to strtobool lib: add "on"/"off" support to kstrtobool lib: update single-char callers of strtobool() lib: move strtobool() to kstrtobool() include/linux/unaligned: force inlining of byteswap operations include/uapi/linux/byteorder, swab: force inlining of some byteswap operations include/asm-generic/atomic-long.h: force inlining of some atomic_long operations usb: common: convert to use match_string() helper ide: hpt366: convert to use match_string() helper ata: hpt366: convert to use match_string() helper power: ab8500: convert to use match_string() helper power: charger_manager: convert to use match_string() helper drm/edid: convert to use match_string() helper pinctrl: convert to use match_string() helper device property: convert to use match_string() helper lib/string: introduce match_string() helper radix-tree tests: add test for radix_tree_iter_next radix-tree tests: add regression3 test ...
-rw-r--r--Documentation/cgroup-v1/cgroups.txt2
-rw-r--r--Documentation/cgroup-v1/cpusets.txt2
-rw-r--r--Documentation/cgroup-v2.txt25
-rw-r--r--Documentation/filesystems/proc.txt18
-rw-r--r--Documentation/sysctl/vm.txt18
-rw-r--r--Documentation/vm/transhuge.txt22
-rw-r--r--MAINTAINERS3
-rw-r--r--arch/arc/include/asm/hugepage.h3
-rw-r--r--arch/arm/mm/fault.c2
-rw-r--r--arch/arm/mm/mmu.c6
-rw-r--r--arch/arm/mm/pgd.c2
-rw-r--r--arch/arm64/mm/fault.c2
-rw-r--r--arch/arm64/mm/hugetlbpage.c2
-rw-r--r--arch/c6x/Kconfig1
-rw-r--r--arch/ia64/include/asm/io.h1
-rw-r--r--arch/ia64/include/asm/rwsem.h2
-rw-r--r--arch/ia64/mm/hugetlbpage.c2
-rw-r--r--arch/metag/mm/hugetlbpage.c2
-rw-r--r--arch/mips/mm/gup.c2
-rw-r--r--arch/mn10300/Kconfig1
-rw-r--r--arch/mn10300/kernel/fpu-nofpu.c1
-rw-r--r--arch/parisc/mm/hugetlbpage.c2
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/kernel/rtasd.c9
-rw-r--r--arch/powerpc/kernel/traps.c5
-rw-r--r--arch/powerpc/mm/hash_utils_64.c36
-rw-r--r--arch/powerpc/mm/init_32.c8
-rw-r--r--arch/powerpc/mm/mmu_context_hash64.c3
-rw-r--r--arch/powerpc/mm/pgtable_64.c2
-rw-r--r--arch/powerpc/platforms/512x/mpc512x_shared.c2
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-cpu.c10
-rw-r--r--arch/s390/kernel/time.c8
-rw-r--r--arch/s390/kernel/topology.c7
-rw-r--r--arch/sh/mm/hugetlbpage.c2
-rw-r--r--arch/sparc/mm/hugetlbpage.c2
-rw-r--r--arch/tile/mm/hugetlbpage.c2
-rw-r--r--arch/tile/mm/init.c11
-rw-r--r--arch/um/kernel/skas/mmu.c2
-rw-r--r--arch/unicore32/mm/fault.c2
-rw-r--r--arch/unicore32/mm/pgd.c2
-rw-r--r--arch/x86/kernel/aperture_64.c12
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/xtensa/mm/fault.c2
-rw-r--r--crypto/async_tx/async_pq.c2
-rw-r--r--drivers/ata/pata_hpt366.c13
-rw-r--r--drivers/base/property.c12
-rw-r--r--drivers/block/aoe/aoecmd.c4
-rw-r--r--drivers/firmware/broadcom/bcm47xx_nvram.c5
-rw-r--r--drivers/gpu/drm/drm_edid_load.c17
-rw-r--r--drivers/ide/hpt366.c9
-rw-r--r--drivers/net/ethernet/freescale/gianfar.c2
-rw-r--r--drivers/net/ethernet/intel/fm10k/fm10k_main.c2
-rw-r--r--drivers/net/ethernet/intel/igb/igb_main.c2
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c2
-rw-r--r--drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_rx.c9
-rw-r--r--drivers/net/ethernet/sun/niu.c2
-rw-r--r--drivers/net/wireless/marvell/mwifiex/debugfs.c10
-rw-r--r--drivers/pinctrl/pinmux.c13
-rw-r--r--drivers/power/ab8500_btemp.c15
-rw-r--r--drivers/power/ab8500_charger.c16
-rw-r--r--drivers/power/ab8500_fg.c15
-rw-r--r--drivers/power/abx500_chargalg.c14
-rw-r--r--drivers/power/charger-manager.c27
-rw-r--r--drivers/usb/common/common.c22
-rw-r--r--drivers/virtio/virtio_balloon.c6
-rw-r--r--fs/btrfs/tests/btrfs-tests.c3
-rw-r--r--fs/cifs/cifs_debug.c56
-rw-r--r--fs/cifs/cifs_debug.h2
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsglob.h4
-rw-r--r--fs/eventpoll.c2
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/proc/base.c71
-rw-r--r--fs/proc/meminfo.c31
-rw-r--r--fs/proc/page.c8
-rw-r--r--fs/proc/vmcore.c7
-rw-r--r--fs/select.c8
-rw-r--r--include/asm-generic/atomic-long.h6
-rw-r--r--include/asm-generic/bug.h27
-rw-r--r--include/asm-generic/pgtable.h17
-rw-r--r--include/linux/buffer_head.h10
-rw-r--r--include/linux/compaction.h16
-rw-r--r--include/linux/freezer.h2
-rw-r--r--include/linux/gfp.h37
-rw-r--r--include/linux/hrtimer.h12
-rw-r--r--include/linux/huge_mm.h22
-rw-r--r--include/linux/kernel.h2
-rw-r--r--include/linux/list_bl.h4
-rw-r--r--include/linux/memcontrol.h44
-rw-r--r--include/linux/mm.h68
-rw-r--r--include/linux/mmzone.h8
-rw-r--r--include/linux/page-flags-layout.h2
-rw-r--r--include/linux/page-flags.h32
-rw-r--r--include/linux/page_ref.h173
-rw-r--r--include/linux/pagemap.h19
-rw-r--r--include/linux/poll.h2
-rw-r--r--include/linux/quicklist.h2
-rw-r--r--include/linux/radix-tree.h28
-rw-r--r--include/linux/rmap.h6
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/sched/sysctl.h21
-rw-r--r--include/linux/string.h8
-rw-r--r--include/linux/tick.h2
-rw-r--r--include/linux/unaligned/access_ok.h24
-rw-r--r--include/linux/vm_event_item.h2
-rw-r--r--include/trace/events/compaction.h55
-rw-r--r--include/trace/events/mmflags.h23
-rw-r--r--include/trace/events/page_ref.h134
-rw-r--r--include/uapi/linux/byteorder/big_endian.h24
-rw-r--r--include/uapi/linux/byteorder/little_endian.h24
-rw-r--r--include/uapi/linux/elf-em.h3
-rw-r--r--include/uapi/linux/swab.h10
-rw-r--r--include/uapi/linux/virtio_balloon.h3
-rw-r--r--kernel/fork.c10
-rw-r--r--kernel/panic.c41
-rw-r--r--kernel/printk/printk.c140
-rw-r--r--kernel/sys.c5
-rw-r--r--kernel/sysctl.c10
-rw-r--r--kernel/time/hrtimer.c18
-rw-r--r--kernel/time/tick-sched.c10
-rw-r--r--kernel/time/timer.c4
-rw-r--r--kernel/watchdog.c9
-rw-r--r--lib/bug.c15
-rw-r--r--lib/kstrtox.c64
-rw-r--r--lib/radix-tree.c182
-rw-r--r--lib/string.c45
-rw-r--r--lib/vsprintf.c59
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Kconfig.debug13
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/bootmem.c7
-rw-r--r--mm/compaction.c232
-rw-r--r--mm/debug.c2
-rw-r--r--mm/debug_page_ref.c54
-rw-r--r--mm/dmapool.c18
-rw-r--r--mm/filemap.c55
-rw-r--r--mm/huge_memory.c337
-rw-r--r--mm/hugetlb.c5
-rw-r--r--mm/internal.h13
-rw-r--r--mm/kasan/report.c6
-rw-r--r--mm/kmemcheck.c3
-rw-r--r--mm/kmemleak-test.c2
-rw-r--r--mm/kmemleak.c32
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memcontrol.c191
-rw-r--r--mm/memory-failure.c52
-rw-r--r--mm/memory.c25
-rw-r--r--mm/memory_hotplug.c54
-rw-r--r--mm/mempolicy.c4
-rw-r--r--mm/mempool.c20
-rw-r--r--mm/migrate.c33
-rw-r--r--mm/mm_init.c7
-rw-r--r--mm/mmap.c133
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nobootmem.c4
-rw-r--r--mm/nommu.c117
-rw-r--r--mm/oom_kill.c16
-rw-r--r--mm/page_alloc.c192
-rw-r--r--mm/page_io.c22
-rw-r--r--mm/page_owner.c5
-rw-r--r--mm/percpu-km.c6
-rw-r--r--mm/percpu.c43
-rw-r--r--mm/pgtable-generic.c14
-rw-r--r--mm/quicklist.c2
-rw-r--r--mm/rmap.c71
-rw-r--r--mm/shmem.c49
-rw-r--r--mm/slab.c89
-rw-r--r--mm/slab.h30
-rw-r--r--mm/slab_common.c14
-rw-r--r--mm/slub.c31
-rw-r--r--mm/sparse-vmemmap.c8
-rw-r--r--mm/sparse.c21
-rw-r--r--mm/swap_cgroup.c5
-rw-r--r--mm/swapfile.c3
-rw-r--r--mm/userfaultfd.c3
-rw-r--r--mm/util.c124
-rw-r--r--mm/vmalloc.c31
-rw-r--r--mm/vmscan.c169
-rw-r--r--mm/vmstat.c2
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/zsmalloc.c29
-rw-r--r--net/core/sock.c2
-rw-r--r--sound/drivers/pcsp/pcsp.c9
-rw-r--r--tools/testing/radix-tree/.gitignore2
-rw-r--r--tools/testing/radix-tree/Makefile19
-rw-r--r--tools/testing/radix-tree/find_next_bit.c57
-rw-r--r--tools/testing/radix-tree/linux.c60
-rw-r--r--tools/testing/radix-tree/linux/bitops.h150
-rw-r--r--tools/testing/radix-tree/linux/bitops/__ffs.h43
-rw-r--r--tools/testing/radix-tree/linux/bitops/ffs.h41
-rw-r--r--tools/testing/radix-tree/linux/bitops/ffz.h12
-rw-r--r--tools/testing/radix-tree/linux/bitops/find.h13
-rw-r--r--tools/testing/radix-tree/linux/bitops/fls.h41
-rw-r--r--tools/testing/radix-tree/linux/bitops/fls64.h14
-rw-r--r--tools/testing/radix-tree/linux/bitops/hweight.h11
-rw-r--r--tools/testing/radix-tree/linux/bitops/le.h53
-rw-r--r--tools/testing/radix-tree/linux/bitops/non-atomic.h111
-rw-r--r--tools/testing/radix-tree/linux/bug.h1
-rw-r--r--tools/testing/radix-tree/linux/cpu.h34
-rw-r--r--tools/testing/radix-tree/linux/export.h2
-rw-r--r--tools/testing/radix-tree/linux/gfp.h10
-rw-r--r--tools/testing/radix-tree/linux/kernel.h35
-rw-r--r--tools/testing/radix-tree/linux/kmemleak.h1
-rw-r--r--tools/testing/radix-tree/linux/mempool.h16
-rw-r--r--tools/testing/radix-tree/linux/notifier.h8
-rw-r--r--tools/testing/radix-tree/linux/percpu.h7
-rw-r--r--tools/testing/radix-tree/linux/preempt.h4
-rw-r--r--tools/testing/radix-tree/linux/radix-tree.h1
-rw-r--r--tools/testing/radix-tree/linux/rcupdate.h9
-rw-r--r--tools/testing/radix-tree/linux/slab.h28
-rw-r--r--tools/testing/radix-tree/linux/types.h28
-rw-r--r--tools/testing/radix-tree/main.c272
-rw-r--r--tools/testing/radix-tree/rcupdate.c86
-rw-r--r--tools/testing/radix-tree/regression.h8
-rw-r--r--tools/testing/radix-tree/regression1.c220
-rw-r--r--tools/testing/radix-tree/regression2.c126
-rw-r--r--tools/testing/radix-tree/regression3.c117
-rw-r--r--tools/testing/radix-tree/tag_check.c332
-rw-r--r--tools/testing/radix-tree/test.c219
-rw-r--r--tools/testing/radix-tree/test.h40
-rw-r--r--tools/vm/page-types.c133
225 files changed, 5048 insertions, 1915 deletions
diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt
index c6256ae9885b..947e6fe31ef9 100644
--- a/Documentation/cgroup-v1/cgroups.txt
+++ b/Documentation/cgroup-v1/cgroups.txt
@@ -8,7 +8,7 @@ Original copyright statements from cpusets.txt:
Portions Copyright (C) 2004 BULL SA.
Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <clameter@sgi.com>
+Modified by Christoph Lameter <cl@linux.com>
CONTENTS:
=========
diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt
index fdf7dff3f607..e5cdcd445615 100644
--- a/Documentation/cgroup-v1/cpusets.txt
+++ b/Documentation/cgroup-v1/cpusets.txt
@@ -6,7 +6,7 @@ Written by Simon.Derr@bull.net
Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <clameter@sgi.com>
+Modified by Christoph Lameter <cl@linux.com>
Modified by Paul Menage <menage@google.com>
Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index ff49cf901148..8f1329a5f700 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -843,6 +843,15 @@ PAGE_SIZE multiple when read back.
Amount of memory used to cache filesystem data,
including tmpfs and shared memory.
+ kernel_stack
+
+ Amount of memory allocated to kernel stacks.
+
+ slab
+
+ Amount of memory used for storing in-kernel data
+ structures.
+
sock
Amount of memory used in network transmission buffers
@@ -871,6 +880,16 @@ PAGE_SIZE multiple when read back.
on the internal memory management lists used by the
page reclaim algorithm
+ slab_reclaimable
+
+ Part of "slab" that might be reclaimed, such as
+ dentries and inodes.
+
+ slab_unreclaimable
+
+ Part of "slab" that cannot be reclaimed on memory
+ pressure.
+
pgfault
Total number of page faults incurred
@@ -1368,6 +1387,12 @@ system than killing the group. Otherwise, memory.max is there to
limit this type of spillover and ultimately contain buggy or even
malicious applications.
+Setting the original memory.limit_in_bytes below the current usage was
+subject to a race condition, where concurrent charges could cause the
+limit setting to fail. memory.max on the other hand will first set the
+limit to prevent new charges, and then reclaim and OOM kill until the
+new limit is met - or the task writing to memory.max is killed.
+
The combined memory+swap accounting and limiting is replaced by real
control over swap space.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 843b045b4069..7f5607a089b4 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -43,6 +43,7 @@ Table of Contents
3.7 /proc/<pid>/task/<tid>/children - Information about task children
3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
3.9 /proc/<pid>/map_files - Information about memory mapped files
+ 3.10 /proc/<pid>/timerslack_ns - Task timerslack value
4 Configuring procfs
4.1 Mount options
@@ -1862,6 +1863,23 @@ time one can open(2) mappings from the listings of two processes and
comparing their inode numbers to figure out which anonymous memory areas
are actually shared.
+3.10 /proc/<pid>/timerslack_ns - Task timerslack value
+---------------------------------------------------------
+This file provides the value of the task's timerslack value in nanoseconds.
+This value specifies a amount of time that normal timers may be deferred
+in order to coalesce timers and avoid unnecessary wakeups.
+
+This allows a task's interactivity vs power consumption trade off to be
+adjusted.
+
+Writing 0 to the file will set the tasks timerslack to the default value.
+
+Valid values are from 0 - ULLONG_MAX
+
+An application setting the value must have PTRACE_MODE_ATTACH_FSCREDS level
+permissions on the task specified to change its timerslack_ns value.
+
+
------------------------------------------------------------------------------
Configuring procfs
------------------------------------------------------------------------------
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 89a887c76629..cb0368459da3 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable
directory and inode objects. With vfs_cache_pressure=1000, it will look for
ten times more freeable objects than there are.
+=============================================================
+
+watermark_scale_factor:
+
+This factor controls the aggressiveness of kswapd. It defines the
+amount of memory left in a node/system before kswapd is woken up and
+how much memory needs to be free before kswapd goes back to sleep.
+
+The unit is in fractions of 10,000. The default value of 10 means the
+distances between watermarks are 0.1% of the available memory in the
+node/system. The maximum value is 1000, or 10% of memory.
+
+A high rate of threads entering direct reclaim (allocstall) or kswapd
+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
+that the number of free pages kswapd maintains for latency reasons is
+too small for the allocation bursts occurring in the system. This knob
+can then be used to tune kswapd aggressiveness accordingly.
+
==============================================================
zone_reclaim_mode:
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 21cf34f3ddb2..d9cb65cf5cfd 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -113,9 +113,26 @@ guaranteed, but it may be more likely in case the allocation is for a
MADV_HUGEPAGE region.
echo always >/sys/kernel/mm/transparent_hugepage/defrag
+echo defer >/sys/kernel/mm/transparent_hugepage/defrag
echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
echo never >/sys/kernel/mm/transparent_hugepage/defrag
+"always" means that an application requesting THP will stall on allocation
+failure and directly reclaim pages and compact memory in an effort to
+allocate a THP immediately. This may be desirable for virtual machines
+that benefit heavily from THP use and are willing to delay the VM start
+to utilise them.
+
+"defer" means that an application will wake kswapd in the background
+to reclaim pages and wake kcompact to compact memory so that THP is
+available in the near future. It's the responsibility of khugepaged
+to then install the THP pages later.
+
+"madvise" will enter direct reclaim like "always" but only for regions
+that are have used madvise(MADV_HUGEPAGE). This is the default behaviour.
+
+"never" should be self-explanatory.
+
By default kernel tries to use huge zero page on read page fault.
It's possible to disable huge zero page by writing 0 or enable it
back by writing 1:
@@ -229,6 +246,11 @@ thp_split_page is incremented every time a huge page is split into base
thp_split_page_failed is is incremented if kernel fails to split huge
page. This can happen if the page was pinned by somebody.
+thp_deferred_split_page is incremented when a huge page is put onto split
+ queue. This happens when a huge page is partially unmapped and
+ splitting it would free up some memory. Pages on split queue are
+ going to be split under memory pressure.
+
thp_split_pmd is incremented every time a PMD split into table of PTEs.
This can happen, for instance, when application calls mprotect() or
munmap() on part of huge page. It doesn't split huge page, only
diff --git a/MAINTAINERS b/MAINTAINERS
index 1c6d7781812e..2933d90512a3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8498,7 +8498,7 @@ F: include/crypto/pcrypt.h
PER-CPU MEMORY ALLOCATOR
M: Tejun Heo <tj@kernel.org>
-M: Christoph Lameter <cl@linux-foundation.org>
+M: Christoph Lameter <cl@linux.com>
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git
S: Maintained
F: include/linux/percpu*.h
@@ -11296,7 +11296,6 @@ F: include/linux/cdrom.h
F: include/uapi/linux/cdrom.h
UNISYS S-PAR DRIVERS
-M: Benjamin Romer <benjamin.romer@unisys.com>
M: David Kershner <david.kershner@unisys.com>
L: sparmaintainer@unisys.com (Unisys internal)
S: Supported
diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h
index c5094de86403..7afe3356b770 100644
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -30,19 +30,16 @@ static inline pmd_t pte_pmd(pte_t pte)
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkhuge(pmd) pte_pmd(pte_mkhuge(pmd_pte(pmd)))
#define pmd_mknotpresent(pmd) pte_pmd(pte_mknotpresent(pmd_pte(pmd)))
-#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
-#define pmd_special(pmd) pte_special(pmd_pte(pmd))
#define mk_pmd(page, prot) pte_pmd(mk_pte(page, prot))
#define pmd_trans_huge(pmd) (pmd_val(pmd) & _PAGE_HW_SZ)
-#define pmd_trans_splitting(pmd) (pmd_trans_huge(pmd) && pmd_special(pmd))
#define pfn_pmd(pfn, prot) (__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)))
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index daafcf121ce0..ad5841856007 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -346,7 +346,7 @@ retry:
up_read(&mm->mmap_sem);
/*
- * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
+ * Handle the "normal" case first - VM_FAULT_MAJOR
*/
if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
return 0;
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 434d76f0b363..88fbe0d23ca6 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -732,7 +732,7 @@ static void *__init late_alloc(unsigned long sz)
return ptr;
}
-static pte_t * __init pte_alloc(pmd_t *pmd, unsigned long addr,
+static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr,
unsigned long prot,
void *(*alloc)(unsigned long sz))
{
@@ -747,7 +747,7 @@ static pte_t * __init pte_alloc(pmd_t *pmd, unsigned long addr,
static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr,
unsigned long prot)
{
- return pte_alloc(pmd, addr, prot, early_alloc);
+ return arm_pte_alloc(pmd, addr, prot, early_alloc);
}
static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
@@ -756,7 +756,7 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
void *(*alloc)(unsigned long sz),
bool ng)
{
- pte_t *pte = pte_alloc(pmd, addr, type->prot_l1, alloc);
+ pte_t *pte = arm_pte_alloc(pmd, addr, type->prot_l1, alloc);
do {
set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)),
ng ? PTE_EXT_NG : 0);
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index e683db1b90a3..b8d477321730 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -80,7 +80,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
if (!new_pmd)
goto no_pmd;
- new_pte = pte_alloc_map(mm, NULL, new_pmd, 0);
+ new_pte = pte_alloc_map(mm, new_pmd, 0);
if (!new_pte)
goto no_pte;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 0077674b2b38..95df28bc875f 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -304,7 +304,7 @@ retry:
up_read(&mm->mmap_sem);
/*
- * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
+ * Handle the "normal" case first - VM_FAULT_MAJOR
*/
if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
VM_FAULT_BADACCESS))))
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index da30529bb1f6..589fd28e1fb5 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -124,7 +124,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
* will be no pte_unmap() to correspond with this
* pte_alloc_map().
*/
- pte = pte_alloc_map(mm, NULL, pmd, addr);
+ pte = pte_alloc_map(mm, pmd, addr);
} else if (sz == PMD_SIZE) {
if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
pud_none(*pud))
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 79049d432d3c..5aa8ea8bad2d 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -36,6 +36,7 @@ config GENERIC_HWEIGHT
config GENERIC_BUG
def_bool y
+ depends on BUG
config C6X_BIG_KERNEL
bool "Build a big kernel"
diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h
index a865d2a04f75..5de673ac9cb1 100644
--- a/arch/ia64/include/asm/io.h
+++ b/arch/ia64/include/asm/io.h
@@ -433,6 +433,7 @@ static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned lo
return ioremap(phys_addr, size);
}
#define ioremap_cache ioremap_cache
+#define ioremap_uc ioremap_nocache
/*
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 3027e7516d85..ce112472bdd6 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -3,7 +3,7 @@
*
* Copyright (C) 2003 Ken Chen <kenneth.w.chen@intel.com>
* Copyright (C) 2003 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 2005 Christoph Lameter <clameter@sgi.com>
+ * Copyright (C) 2005 Christoph Lameter <cl@linux.com>
*
* Based on asm-i386/rwsem.h and other architecture implementation.
*
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f50d4b3f501a..85de86d36fdf 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
if (pud) {
pmd = pmd_alloc(mm, pud, taddr);
if (pmd)
- pte = pte_alloc_map(mm, NULL, pmd, taddr);
+ pte = pte_alloc_map(mm, pmd, taddr);
}
return pte;
}
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 53f0f6c47027..b38700ae4e84 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -67,7 +67,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pgd = pgd_offset(mm, addr);
pud = pud_offset(pgd, addr);
pmd = pmd_offset(pud, addr);
- pte = pte_alloc_map(mm, NULL, pmd, addr);
+ pte = pte_alloc_map(mm, pmd, addr);
pgd->pgd &= ~_PAGE_SZ_MASK;
pgd->pgd |= _PAGE_SZHUGE;
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 1afd87c999b0..6cdffc76735c 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -64,7 +64,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
{
VM_BUG_ON(page != compound_head(page));
VM_BUG_ON(page_count(page) == 0);
- atomic_add(nr, &page->_count);
+ page_ref_add(page, nr);
SetPageReferenced(page);
}
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 10607f0d2bcd..06ddb5501ab1 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -53,6 +53,7 @@ config GENERIC_HWEIGHT
config GENERIC_BUG
def_bool y
+ depends on BUG
config QUICKLIST
def_bool y
diff --git a/arch/mn10300/kernel/fpu-nofpu.c b/arch/mn10300/kernel/fpu-nofpu.c
index 31c765b92c5d..8d0e041aa798 100644
--- a/arch/mn10300/kernel/fpu-nofpu.c
+++ b/arch/mn10300/kernel/fpu-nofpu.c
@@ -9,6 +9,7 @@
* 2 of the Licence, or (at your option) any later version.
*/
#include <asm/fpu.h>
+#include <asm/elf.h>
/*
* handle an FPU operational exception
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index 54ba39262b82..5d6eea925cf4 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -63,7 +63,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
if (pud) {
pmd = pmd_alloc(mm, pud, addr);
if (pmd)
- pte = pte_alloc_map(mm, NULL, pmd, addr);
+ pte = pte_alloc_map(mm, pmd, addr);
}
return pte;
}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 832cc461d0af..a030e5ecb10b 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -158,6 +158,7 @@ config PPC
select ARCH_HAS_DEVMEM_IS_ALLOWED
select HAVE_ARCH_SECCOMP_FILTER
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 5a2c049c1c61..aa610ce8742f 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -49,7 +49,7 @@ static unsigned int rtas_error_log_buffer_max;
static unsigned int event_scan;
static unsigned int rtas_event_scan_rate;
-static int full_rtas_msgs = 0;
+static bool full_rtas_msgs;
/* Stop logging to nvram after first fatal error */
static int logging_enabled; /* Until we initialize everything,
@@ -592,11 +592,6 @@ __setup("surveillance=", surveillance_setup);
static int __init rtasmsgs_setup(char *str)
{
- if (strcmp(str, "on") == 0)
- full_rtas_msgs = 1;
- else if (strcmp(str, "off") == 0)
- full_rtas_msgs = 0;
-
- return 1;
+ return (kstrtobool(str, &full_rtas_msgs) == 0);
}
__setup("rtasmsgs=", rtasmsgs_setup);
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index b6becc795bb5..33c47fcc455a 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -203,9 +203,8 @@ static int __kprobes __die(const char *str, struct pt_regs *regs, long err)
#ifdef CONFIG_SMP
printk("SMP NR_CPUS=%d ", NR_CPUS);
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC ");
-#endif
+ if (debug_pagealloc_enabled())
+ printk("DEBUG_PAGEALLOC ");
#ifdef CONFIG_NUMA
printk("NUMA ");
#endif
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index ba59d5977f34..1005281be9a6 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -255,8 +255,10 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
if (ret < 0)
break;
+
#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
+ if (debug_pagealloc_enabled() &&
+ (paddr >> PAGE_SHIFT) < linear_map_hash_count)
linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
#endif /* CONFIG_DEBUG_PAGEALLOC */
}
@@ -512,17 +514,17 @@ static void __init htab_init_page_sizes(void)
if (mmu_has_feature(MMU_FTR_16M_PAGE))
memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
sizeof(mmu_psize_defaults_gp));
- found:
-#ifndef CONFIG_DEBUG_PAGEALLOC
- /*
- * Pick a size for the linear mapping. Currently, we only support
- * 16M, 1M and 4K which is the default
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
- mmu_linear_psize = MMU_PAGE_16M;
- else if (mmu_psize_defs[MMU_PAGE_1M].shift)
- mmu_linear_psize = MMU_PAGE_1M;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
+found:
+ if (!debug_pagealloc_enabled()) {
+ /*
+ * Pick a size for the linear mapping. Currently, we only
+ * support 16M, 1M and 4K which is the default
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ mmu_linear_psize = MMU_PAGE_16M;
+ else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+ mmu_linear_psize = MMU_PAGE_1M;
+ }
#ifdef CONFIG_PPC_64K_PAGES
/*
@@ -721,10 +723,12 @@ static void __init htab_initialize(void)
prot = pgprot_val(PAGE_KERNEL);
#ifdef CONFIG_DEBUG_PAGEALLOC
- linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
- linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count,
- 1, ppc64_rma_size));
- memset(linear_map_hash_slots, 0, linear_map_hash_count);
+ if (debug_pagealloc_enabled()) {
+ linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ linear_map_hash_slots = __va(memblock_alloc_base(
+ linear_map_hash_count, 1, ppc64_rma_size));
+ memset(linear_map_hash_slots, 0, linear_map_hash_count);
+ }
#endif /* CONFIG_DEBUG_PAGEALLOC */
/* On U3 based machines, we need to reserve the DART area and
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index a10be665b645..c2b771614d4f 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -112,10 +112,10 @@ void __init MMU_setup(void)
if (strstr(boot_command_line, "noltlbs")) {
__map_without_ltlbs = 1;
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
- __map_without_bats = 1;
- __map_without_ltlbs = 1;
-#endif
+ if (debug_pagealloc_enabled()) {
+ __map_without_bats = 1;
+ __map_without_ltlbs = 1;
+ }
}
/*
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 4e4efbc2658e..9ca6fe16cb29 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -118,8 +118,7 @@ static void destroy_pagetable_page(struct mm_struct *mm)
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
- if (!count) {
+ if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) {
pgtable_page_dtor(page);
free_hot_cold_page(page, 0);
}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index cdf2123d46db..d9cc66cbdbb7 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -403,7 +403,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
* count.
*/
if (likely(!mm->context.pte_frag)) {
- atomic_set(&page->_count, PTE_FRAG_NR);
+ set_page_count(page, PTE_FRAG_NR);
mm->context.pte_frag = ret + PTE_FRAG_SIZE;
}
spin_unlock(&mm->page_table_lock);
diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index 711f3d352af7..452da2391153 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -188,7 +188,7 @@ static struct fsl_diu_shared_fb __attribute__ ((__aligned__(8))) diu_shared_fb;
static inline void mpc512x_free_bootmem(struct page *page)
{
BUG_ON(PageTail(page));
- BUG_ON(atomic_read(&page->_count) > 1);
+ BUG_ON(page_ref_count(page) > 1);
free_reserved_page(page);
}
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 32274f72fe3f..282837a1d74b 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -47,20 +47,14 @@ static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
-static int cede_offline_enabled __read_mostly = 1;
+static bool cede_offline_enabled __read_mostly = true;
/*
* Enable/disable cede_offline when available.
*/
static int __init setup_cede_offline(char *str)
{
- if (!strcmp(str, "off"))
- cede_offline_enabled = 0;
- else if (!strcmp(str, "on"))
- cede_offline_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &cede_offline_enabled) == 0);
}
__setup("cede_offline=", setup_cede_offline);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index c4e5f183f225..9409d32f285e 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -1432,7 +1432,7 @@ device_initcall(etr_init_sysfs);
/*
* Server Time Protocol (STP) code.
*/
-static int stp_online;
+static bool stp_online;
static struct stp_sstpi stp_info;
static void *stp_page;
@@ -1443,11 +1443,7 @@ static struct timer_list stp_timer;
static int __init early_parse_stp(char *p)
{
- if (strncmp(p, "off", 3) == 0)
- stp_online = 0;
- else if (strncmp(p, "on", 2) == 0)
- stp_online = 1;
- return 0;
+ return kstrtobool(p, &stp_online);
}
early_param("stp", early_parse_stp);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 40b8102fdadb..64298a867589 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -37,7 +37,7 @@ static void set_topology_timer(void);
static void topology_work_fn(struct work_struct *work);
static struct sysinfo_15_1_x *tl_info;
-static int topology_enabled = 1;
+static bool topology_enabled = true;
static DECLARE_WORK(topology_work, topology_work_fn);
/*
@@ -444,10 +444,7 @@ static const struct cpumask *cpu_book_mask(int cpu)
static int __init early_parse_topology(char *p)
{
- if (strncmp(p, "off", 3))
- return 0;
- topology_enabled = 0;
- return 0;
+ return kstrtobool(p, &topology_enabled);
}
early_param("topology", early_parse_topology);
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6385f60209b6..cc948db74878 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
if (pud) {
pmd = pmd_alloc(mm, pud, addr);
if (pmd)
- pte = pte_alloc_map(mm, NULL, pmd, addr);
+ pte = pte_alloc_map(mm, pmd, addr);
}
}
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 131eaf4ad7f5..4977800e9770 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -146,7 +146,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
if (pud) {
pmd = pmd_alloc(mm, pud, addr);
if (pmd)
- pte = pte_alloc_map(mm, NULL, pmd, addr);
+ pte = pte_alloc_map(mm, pmd, addr);
}
return pte;
}
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index c034dc3fe2d4..e212c64682c5 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -77,7 +77,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
else {
if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE])
panic("Unexpected page size %#lx\n", sz);
- return pte_alloc_map(mm, NULL, pmd, addr);
+ return pte_alloc_map(mm, pmd, addr);
}
}
#else
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index d4e1fc41d06d..a0582b7f41d3 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -896,17 +896,15 @@ void __init pgtable_cache_init(void)
panic("pgtable_cache_init(): Cannot create pgd cache");
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static long __write_once initfree;
-#else
static long __write_once initfree = 1;
-#endif
+static bool __write_once set_initfree_done;
/* Select whether to free (1) or mark unusable (0) the __init pages. */
static int __init set_initfree(char *str)
{
long val;
if (kstrtol(str, 0, &val) == 0) {
+ set_initfree_done = true;
initfree = val;
pr_info("initfree: %s free init pages\n",
initfree ? "will" : "won't");
@@ -919,6 +917,11 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
unsigned long addr = (unsigned long) begin;
+ /* Prefer user request first */
+ if (!set_initfree_done) {
+ if (debug_pagealloc_enabled())
+ initfree = 0;
+ }
if (kdata_huge && !initfree) {
pr_warn("Warning: ignoring initfree=0: incompatible with kdata=huge\n");
initfree = 1;
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 9591a66aa5c5..3943e9d7d13d 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
if (!pmd)
goto out_pmd;
- pte = pte_alloc_map(mm, NULL, pmd, proc);
+ pte = pte_alloc_map(mm, pmd, proc);
if (!pte)
goto out_pte;
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index afccef5529cc..2ec3d3adcefc 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -276,7 +276,7 @@ retry:
up_read(&mm->mmap_sem);
/*
- * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
+ * Handle the "normal" case first - VM_FAULT_MAJOR
*/
if (likely(!(fault &
(VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c
index 2ade20d8eab3..c572a28c76c9 100644
--- a/arch/unicore32/mm/pgd.c
+++ b/arch/unicore32/mm/pgd.c
@@ -54,7 +54,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
if (!new_pmd)
goto no_pmd;
- new_pte = pte_alloc_map(mm, NULL, new_pmd, 0);
+ new_pte = pte_alloc_map(mm, new_pmd, 0);
if (!new_pte)
goto no_pte;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e85f713641d..0a2bb1f62e72 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
return 0;
}
-static int gart_fix_e820 __initdata = 1;
+static bool gart_fix_e820 __initdata = true;
static int __init parse_gart_mem(char *p)
{
- if (!p)
- return -EINVAL;
-
- if (!strncmp(p, "off", 3))
- gart_fix_e820 = 0;
- else if (!strncmp(p, "on", 2))
- gart_fix_e820 = 1;
-
- return 0;
+ return kstrtobool(p, &gart_fix_e820);
}
early_param("gart_fix_e820", parse_gart_mem);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 91a4496db434..e72a07f20b05 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -135,7 +135,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pmd = pmd_alloc(&tboot_mm, pud, vaddr);
if (!pmd)
return -1;
- pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
+ pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
if (!pte)
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index d8a798d8bf50..f8d0b5e8bdfd 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -131,7 +131,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
{
VM_BUG_ON_PAGE(page != compound_head(page), page);
VM_BUG_ON_PAGE(page_count(page) == 0, page);
- atomic_add(nr, &page->_count);
+ page_ref_add(page, nr);
SetPageReferenced(page);
}
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index c9784c1b18d8..7f4a1fdb1502 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -146,7 +146,7 @@ good_area:
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (flags & VM_FAULT_MAJOR)
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
- else if (flags & VM_FAULT_MINOR)
+ else
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
return;
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index c0748bbd4c08..08b3ac68952b 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -444,7 +444,7 @@ static int __init async_pq_init(void)
static void __exit async_pq_exit(void)
{
- put_page(pq_scribble_page);
+ __free_page(pq_scribble_page);
}
module_init(async_pq_init);
diff --git a/drivers/ata/pata_hpt366.c b/drivers/ata/pata_hpt366.c
index 0038dc4c06c7..e5fb7525a5df 100644
--- a/drivers/ata/pata_hpt366.c
+++ b/drivers/ata/pata_hpt366.c
@@ -176,17 +176,14 @@ static int hpt_dma_blacklisted(const struct ata_device *dev, char *modestr,
const char * const list[])
{
unsigned char model_num[ATA_ID_PROD_LEN + 1];
- int i = 0;
+ int i;
ata_id_c_string(dev->id, model_num, ATA_ID_PROD, sizeof(model_num));
- while (list[i] != NULL) {
- if (!strcmp(list[i], model_num)) {
- pr_warn("%s is not supported for %s\n",
- modestr, list[i]);
- return 1;
- }
- i++;
+ i = match_string(list, -1, model_num);
+ if (i >= 0) {
+ pr_warn("%s is not supported for %s\n", modestr, list[i]);
+ return 1;
}
return 0;
}
diff --git a/drivers/base/property.c b/drivers/base/property.c
index 76628a7b45f1..9b1a65debd49 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -651,7 +651,7 @@ int fwnode_property_match_string(struct fwnode_handle *fwnode,
const char *propname, const char *string)
{
const char **values;
- int nval, ret, i;
+ int nval, ret;
nval = fwnode_property_read_string_array(fwnode, propname, NULL, 0);
if (nval < 0)
@@ -668,13 +668,9 @@ int fwnode_property_match_string(struct fwnode_handle *fwnode,
if (ret < 0)
goto out;
- ret = -ENODATA;
- for (i = 0; i < nval; i++) {
- if (!strcmp(values[i], string)) {
- ret = i;
- break;
- }
- }
+ ret = match_string(values, nval, string);
+ if (ret < 0)
+ ret = -ENODATA;
out:
kfree(values);
return ret;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index d048d2009e89..437b3a822f44 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -875,7 +875,7 @@ bio_pageinc(struct bio *bio)
* compound pages is no longer allowed by the kernel.
*/
page = compound_head(bv.bv_page);
- atomic_inc(&page->_count);
+ page_ref_inc(page);
}
}
@@ -888,7 +888,7 @@ bio_pagedec(struct bio *bio)
bio_for_each_segment(bv, bio, iter) {
page = compound_head(bv.bv_page);
- atomic_dec(&page->_count);
+ page_ref_dec(page);
}
}
diff --git a/drivers/firmware/broadcom/bcm47xx_nvram.c b/drivers/firmware/broadcom/bcm47xx_nvram.c
index 0c2f0a61b0ea..0b631e5b5b84 100644
--- a/drivers/firmware/broadcom/bcm47xx_nvram.c
+++ b/drivers/firmware/broadcom/bcm47xx_nvram.c
@@ -94,15 +94,14 @@ static int nvram_find_and_copy(void __iomem *iobase, u32 lim)
found:
__ioread32_copy(nvram_buf, header, sizeof(*header) / 4);
- header = (struct nvram_header *)nvram_buf;
- nvram_len = header->len;
+ nvram_len = ((struct nvram_header *)(nvram_buf))->len;
if (nvram_len > size) {
pr_err("The nvram size according to the header seems to be bigger than the partition on flash\n");
nvram_len = size;
}
if (nvram_len >= NVRAM_SPACE) {
pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n",
- header->len, NVRAM_SPACE - 1);
+ nvram_len, NVRAM_SPACE - 1);
nvram_len = NVRAM_SPACE - 1;
}
/* proceed reading data after header */
diff --git a/drivers/gpu/drm/drm_edid_load.c b/drivers/gpu/drm/drm_edid_load.c
index 698b8c3b09d9..9a401aed98e0 100644
--- a/drivers/gpu/drm/drm_edid_load.c
+++ b/drivers/gpu/drm/drm_edid_load.c
@@ -170,16 +170,11 @@ static void *edid_load(struct drm_connector *connector, const char *name,
int i, valid_extensions = 0;
bool print_bad_edid = !connector->bad_edid_counter || (drm_debug & DRM_UT_KMS);
- builtin = 0;
- for (i = 0; i < GENERIC_EDIDS; i++) {
- if (strcmp(name, generic_edid_name[i]) == 0) {
- fwdata = generic_edid[i];
- fwsize = sizeof(generic_edid[i]);
- builtin = 1;
- break;
- }
- }
- if (!builtin) {
+ builtin = match_string(generic_edid_name, GENERIC_EDIDS, name);
+ if (builtin >= 0) {
+ fwdata = generic_edid[builtin];
+ fwsize = sizeof(generic_edid[builtin]);
+ } else {
struct platform_device *pdev;
int err;
@@ -252,7 +247,7 @@ static void *edid_load(struct drm_connector *connector, const char *name,
}
DRM_INFO("Got %s EDID base block and %d extension%s from "
- "\"%s\" for connector \"%s\"\n", builtin ? "built-in" :
+ "\"%s\" for connector \"%s\"\n", (builtin >= 0) ? "built-in" :
"external", valid_extensions, valid_extensions == 1 ? "" : "s",
name, connector_name);
diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
index 696b6c1ec940..f94baadbf424 100644
--- a/drivers/ide/hpt366.c
+++ b/drivers/ide/hpt366.c
@@ -531,14 +531,9 @@ static const struct hpt_info hpt371n = {
.timings = &hpt37x_timings
};
-static int check_in_drive_list(ide_drive_t *drive, const char **list)
+static bool check_in_drive_list(ide_drive_t *drive, const char **list)
{
- char *m = (char *)&drive->id[ATA_ID_PROD];
-
- while (*list)
- if (!strcmp(*list++, m))
- return 1;
- return 0;
+ return match_string(list, -1, (char *)&drive->id[ATA_ID_PROD]) >= 0;
}
static struct hpt_info *hpt3xx_get_info(struct device *dev)
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index b9ecf197ad11..f21b2c479780 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -2944,7 +2944,7 @@ static bool gfar_add_rx_frag(struct gfar_rx_buff *rxb, u32 lstatus,
/* change offset to the other half */
rxb->page_offset ^= GFAR_RXB_TRUESIZE;
- atomic_inc(&page->_count);
+ page_ref_inc(page);
return true;
}
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index b243c3cbe68f..b4547ebed774 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -243,7 +243,7 @@ static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,
/* Even if we own the page, we are not allowed to use atomic_set()
* This would break get_page_unless_zero() users.
*/
- atomic_inc(&page->_count);
+ page_ref_inc(page);
return true;
}
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 31e5f3942839..5b4ad1ad4d5f 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6630,7 +6630,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
/* Even if we own the page, we are not allowed to use atomic_set()
* This would break get_page_unless_zero() users.
*/
- atomic_inc(&page->_count);
+ page_ref_inc(page);
return true;
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c4003a88bbf6..e6035ff6b861 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1942,7 +1942,7 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
/* Even if we own the page, we are not allowed to use atomic_set()
* This would break get_page_unless_zero() users.
*/
- atomic_inc(&page->_count);
+ page_ref_inc(page);
return true;
}
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 3558f019b631..0ea14c0a2e74 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -837,7 +837,7 @@ add_tail_frag:
/* Even if we own the page, we are not allowed to use atomic_set()
* This would break get_page_unless_zero() users.
*/
- atomic_inc(&page->_count);
+ page_ref_inc(page);
return true;
}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 41440b2b20a3..86bcfe510e4e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -82,8 +82,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
/* Not doing get_page() for each frag is a big win
* on asymetric workloads. Note we can not use atomic_set().
*/
- atomic_add(page_alloc->page_size / frag_info->frag_stride - 1,
- &page->_count);
+ page_ref_add(page, page_alloc->page_size / frag_info->frag_stride - 1);
return 0;
}
@@ -127,7 +126,7 @@ out:
dma_unmap_page(priv->ddev, page_alloc[i].dma,
page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
page = page_alloc[i].page;
- atomic_set(&page->_count, 1);
+ set_page_count(page, 1);
put_page(page);
}
}
@@ -165,7 +164,7 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
en_dbg(DRV, priv, " frag %d allocator: - size:%d frags:%d\n",
i, ring->page_alloc[i].page_size,
- atomic_read(&ring->page_alloc[i].page->_count));
+ page_ref_count(ring->page_alloc[i].page));
}
return 0;
@@ -177,7 +176,7 @@ out:
dma_unmap_page(priv->ddev, page_alloc->dma,
page_alloc->page_size, PCI_DMA_FROMDEVICE);
page = page_alloc->page;
- atomic_set(&page->_count, 1);
+ set_page_count(page, 1);
put_page(page);
page_alloc->page = NULL;
}
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index ab6051a43134..9cc45649f477 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -3341,7 +3341,7 @@ static int niu_rbr_add_page(struct niu *np, struct rx_ring_info *rp,
niu_hash_page(rp, page, addr);
if (rp->rbr_blocks_per_page > 1)
- atomic_add(rp->rbr_blocks_per_page - 1, &page->_count);
+ page_ref_add(page, rp->rbr_blocks_per_page - 1);
for (i = 0; i < rp->rbr_blocks_per_page; i++) {
__le32 *rbr = &rp->rbr[start_index + i];
diff --git a/drivers/net/wireless/marvell/mwifiex/debugfs.c b/drivers/net/wireless/marvell/mwifiex/debugfs.c
index 0b9c580af988..2eff989c6d9f 100644
--- a/drivers/net/wireless/marvell/mwifiex/debugfs.c
+++ b/drivers/net/wireless/marvell/mwifiex/debugfs.c
@@ -880,14 +880,12 @@ mwifiex_reset_write(struct file *file,
{
struct mwifiex_private *priv = file->private_data;
struct mwifiex_adapter *adapter = priv->adapter;
- char cmd;
bool result;
+ int rc;
- if (copy_from_user(&cmd, ubuf, sizeof(cmd)))
- return -EFAULT;
-
- if (strtobool(&cmd, &result))
- return -EINVAL;
+ rc = kstrtobool_from_user(ubuf, count, &result);
+ if (rc)
+ return rc;
if (!result)
return -EINVAL;
diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c
index 29984b36926a..c223a9ef1fe1 100644
--- a/drivers/pinctrl/pinmux.c
+++ b/drivers/pinctrl/pinmux.c
@@ -334,7 +334,6 @@ int pinmux_map_to_setting(struct pinctrl_map const *map,
unsigned num_groups;
int ret;
const char *group;
- int i;
if (!pmxops) {
dev_err(pctldev->dev, "does not support mux function\n");
@@ -363,19 +362,13 @@ int pinmux_map_to_setting(struct pinctrl_map const *map,
return -EINVAL;
}
if (map->data.mux.group) {
- bool found = false;
group = map->data.mux.group;
- for (i = 0; i < num_groups; i++) {
- if (!strcmp(group, groups[i])) {
- found = true;
- break;
- }
- }
- if (!found) {
+ ret = match_string(groups, num_groups, group);
+ if (ret < 0) {
dev_err(pctldev->dev,
"invalid group \"%s\" for function \"%s\"\n",
group, map->data.mux.function);
- return -EINVAL;
+ return ret;
}
} else {
group = groups[0];
diff --git a/drivers/power/ab8500_btemp.c b/drivers/power/ab8500_btemp.c
index 8f8044e1acf3..bf2e5dd301e7 100644
--- a/drivers/power/ab8500_btemp.c
+++ b/drivers/power/ab8500_btemp.c
@@ -906,26 +906,21 @@ static int ab8500_btemp_get_property(struct power_supply *psy,
static int ab8500_btemp_get_ext_psy_data(struct device *dev, void *data)
{
struct power_supply *psy;
- struct power_supply *ext;
+ struct power_supply *ext = dev_get_drvdata(dev);
+ const char **supplicants = (const char **)ext->supplied_to;
struct ab8500_btemp *di;
union power_supply_propval ret;
- int i, j;
- bool psy_found = false;
+ int j;
psy = (struct power_supply *)data;
- ext = dev_get_drvdata(dev);
di = power_supply_get_drvdata(psy);
/*
* For all psy where the name of your driver
* appears in any supplied_to
*/
- for (i = 0; i < ext->num_supplicants; i++) {
- if (!strcmp(ext->supplied_to[i], psy->desc->name))
- psy_found = true;
- }
-
- if (!psy_found)
+ j = match_string(supplicants, ext->num_supplicants, psy->desc->name);
+ if (j < 0)
return 0;
/* Go through all properties for the psy */
diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c
index e388171f4e58..30de5d42b26a 100644
--- a/drivers/power/ab8500_charger.c
+++ b/drivers/power/ab8500_charger.c
@@ -1929,11 +1929,11 @@ static int ab8540_charger_usb_pre_chg_enable(struct ux500_charger *charger,
static int ab8500_charger_get_ext_psy_data(struct device *dev, void *data)
{
struct power_supply *psy;
- struct power_supply *ext;
+ struct power_supply *ext = dev_get_drvdata(dev);
+ const char **supplicants = (const char **)ext->supplied_to;
struct ab8500_charger *di;
union power_supply_propval ret;
- int i, j;
- bool psy_found = false;
+ int j;
struct ux500_charger *usb_chg;
usb_chg = (struct ux500_charger *)data;
@@ -1941,15 +1941,9 @@ static int ab8500_charger_get_ext_psy_data(struct device *dev, void *data)
di = to_ab8500_charger_usb_device_info(usb_chg);
- ext = dev_get_drvdata(dev);
-
/* For all psy where the driver name appears in any supplied_to */
- for (i = 0; i < ext->num_supplicants; i++) {
- if (!strcmp(ext->supplied_to[i], psy->desc->name))
- psy_found = true;
- }
-
- if (!psy_found)
+ j = match_string(supplicants, ext->num_supplicants, psy->desc->name);
+ if (j < 0)
return 0;
/* Go through all properties for the psy */
diff --git a/drivers/power/ab8500_fg.c b/drivers/power/ab8500_fg.c
index 3830dade5d69..5a36cf88578a 100644
--- a/drivers/power/ab8500_fg.c
+++ b/drivers/power/ab8500_fg.c
@@ -2168,26 +2168,21 @@ static int ab8500_fg_get_property(struct power_supply *psy,
static int ab8500_fg_get_ext_psy_data(struct device *dev, void *data)
{
struct power_supply *psy;
- struct power_supply *ext;
+ struct power_supply *ext = dev_get_drvdata(dev);
+ const char **supplicants = (const char **)ext->supplied_to;
struct ab8500_fg *di;
union power_supply_propval ret;
- int i, j;
- bool psy_found = false;
+ int j;
psy = (struct power_supply *)data;
- ext = dev_get_drvdata(dev);
di = power_supply_get_drvdata(psy);
/*
* For all psy where the name of your driver
* appears in any supplied_to
*/
- for (i = 0; i < ext->num_supplicants; i++) {
- if (!strcmp(ext->supplied_to[i], psy->desc->name))
- psy_found = true;
- }
-
- if (!psy_found)
+ j = match_string(supplicants, ext->num_supplicants, psy->desc->name);
+ if (j < 0)
return 0;
/* Go through all properties for the psy */
diff --git a/drivers/power/abx500_chargalg.c b/drivers/power/abx500_chargalg.c
index 541f702e0451..d9104b1ab7cf 100644
--- a/drivers/power/abx500_chargalg.c
+++ b/drivers/power/abx500_chargalg.c
@@ -975,22 +975,18 @@ static void handle_maxim_chg_curr(struct abx500_chargalg *di)
static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
{
struct power_supply *psy;
- struct power_supply *ext;
+ struct power_supply *ext = dev_get_drvdata(dev);
+ const char **supplicants = (const char **)ext->supplied_to;
struct abx500_chargalg *di;
union power_supply_propval ret;
- int i, j;
- bool psy_found = false;
+ int j;
bool capacity_updated = false;
psy = (struct power_supply *)data;
- ext = dev_get_drvdata(dev);
di = power_supply_get_drvdata(psy);
/* For all psy where the driver name appears in any supplied_to */
- for (i = 0; i < ext->num_supplicants; i++) {
- if (!strcmp(ext->supplied_to[i], psy->desc->name))
- psy_found = true;
- }
- if (!psy_found)
+ j = match_string(supplicants, ext->num_supplicants, psy->desc->name);
+ if (j < 0)
return 0;
/*
diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c
index 1ea5d1aa268b..e664ca7c0afd 100644
--- a/drivers/power/charger-manager.c
+++ b/drivers/power/charger-manager.c
@@ -2020,27 +2020,6 @@ static void __exit charger_manager_cleanup(void)
module_exit(charger_manager_cleanup);
/**
- * find_power_supply - find the associated power_supply of charger
- * @cm: the Charger Manager representing the battery
- * @psy: pointer to instance of charger's power_supply
- */
-static bool find_power_supply(struct charger_manager *cm,
- struct power_supply *psy)
-{
- int i;
- bool found = false;
-
- for (i = 0; cm->desc->psy_charger_stat[i]; i++) {
- if (!strcmp(psy->desc->name, cm->desc->psy_charger_stat[i])) {
- found = true;
- break;
- }
- }
-
- return found;
-}
-
-/**
* cm_notify_event - charger driver notify Charger Manager of charger event
* @psy: pointer to instance of charger's power_supply
* @type: type of charger event
@@ -2057,9 +2036,11 @@ void cm_notify_event(struct power_supply *psy, enum cm_event_types type,
mutex_lock(&cm_list_mtx);
list_for_each_entry(cm, &cm_list, entry) {
- found_power_supply = find_power_supply(cm, psy);
- if (found_power_supply)
+ if (match_string(cm->desc->psy_charger_stat, -1,
+ psy->desc->name) >= 0) {
+ found_power_supply = true;
break;
+ }
}
mutex_unlock(&cm_list_mtx);
diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c
index 49fbfe8b0f24..e3d01619d6b3 100644
--- a/drivers/usb/common/common.c
+++ b/drivers/usb/common/common.c
@@ -65,18 +65,15 @@ EXPORT_SYMBOL_GPL(usb_speed_string);
enum usb_device_speed usb_get_maximum_speed(struct device *dev)
{
const char *maximum_speed;
- int err;
- int i;
+ int ret;
- err = device_property_read_string(dev, "maximum-speed", &maximum_speed);
- if (err < 0)
+ ret = device_property_read_string(dev, "maximum-speed", &maximum_speed);
+ if (ret < 0)
return USB_SPEED_UNKNOWN;
- for (i = 0; i < ARRAY_SIZE(speed_names); i++)
- if (strcmp(maximum_speed, speed_names[i]) == 0)
- return i;
+ ret = match_string(speed_names, ARRAY_SIZE(speed_names), maximum_speed);
- return USB_SPEED_UNKNOWN;
+ return (ret < 0) ? USB_SPEED_UNKNOWN : ret;
}
EXPORT_SYMBOL_GPL(usb_get_maximum_speed);
@@ -110,13 +107,10 @@ static const char *const usb_dr_modes[] = {
static enum usb_dr_mode usb_get_dr_mode_from_string(const char *str)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(usb_dr_modes); i++)
- if (!strcmp(usb_dr_modes[i], str))
- return i;
+ int ret;
- return USB_DR_MODE_UNKNOWN;
+ ret = match_string(usb_dr_modes, ARRAY_SIZE(usb_dr_modes), str);
+ return (ret < 0) ? USB_DR_MODE_UNKNOWN : ret;
}
enum usb_dr_mode usb_get_dr_mode(struct device *dev)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 0c3691f46575..f2b77dea8d3c 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -30,6 +30,7 @@
#include <linux/balloon_compaction.h>
#include <linux/oom.h>
#include <linux/wait.h>
+#include <linux/mm.h>
/*
* Balloon device works in 4K page units. So each page is pointed to by
@@ -229,10 +230,13 @@ static void update_balloon_stats(struct virtio_balloon *vb)
unsigned long events[NR_VM_EVENT_ITEMS];
struct sysinfo i;
int idx = 0;
+ long available;
all_vm_events(events);
si_meminfo(&i);
+ available = si_mem_available();
+
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
pages_to_bytes(events[PSWPIN]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT,
@@ -243,6 +247,8 @@ static void update_balloon_stats(struct virtio_balloon *vb)
pages_to_bytes(i.freeram));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT,
pages_to_bytes(i.totalram));
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL,
+ pages_to_bytes(available));
}
/*
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 0e1e61a7ec23..1c76d73e06dc 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -137,7 +137,6 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void **slot;
spin_lock(&fs_info->buffer_lock);
-restart:
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
@@ -147,7 +146,7 @@ restart:
/* Shouldn't happen but that kind of thinking creates CVE's */
if (radix_tree_exception(eb)) {
if (radix_tree_deref_retry(eb))
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
continue;
}
spin_unlock(&fs_info->buffer_lock);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 50b268483302..788e19195991 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -255,7 +255,6 @@ static const struct file_operations cifs_debug_data_proc_fops = {
static ssize_t cifs_stats_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
bool bv;
int rc;
struct list_head *tmp1, *tmp2, *tmp3;
@@ -263,11 +262,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- rc = get_user(c, buffer);
- if (rc)
- return rc;
-
- if (strtobool(&c, &bv) == 0) {
+ rc = kstrtobool_from_user(buffer, count, &bv);
+ if (rc == 0) {
#ifdef CONFIG_CIFS_STATS2
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
@@ -290,6 +286,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
}
}
spin_unlock(&cifs_tcp_ses_lock);
+ } else {
+ return rc;
}
return count;
@@ -433,17 +431,17 @@ static int cifsFYI_proc_open(struct inode *inode, struct file *file)
static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char c;
+ char c[2] = { '\0' };
bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = get_user(c[0], buffer);
if (rc)
return rc;
- if (strtobool(&c, &bv) == 0)
+ if (strtobool(c, &bv) == 0)
cifsFYI = bv;
- else if ((c > '1') && (c <= '9'))
- cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */
+ else if ((c[0] > '1') && (c[0] <= '9'))
+ cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
return count;
}
@@ -471,20 +469,12 @@ static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
static ssize_t cifs_linux_ext_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- linuxExtEnabled = bv;
-
return count;
}
@@ -511,20 +501,12 @@ static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
static ssize_t cifs_lookup_cache_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- lookupCacheEnabled = bv;
-
return count;
}
@@ -551,20 +533,12 @@ static int traceSMB_proc_open(struct inode *inode, struct file *file)
static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &traceSMB);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- traceSMB = bv;
-
return count;
}
@@ -622,7 +596,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
int rc;
unsigned int flags;
char flags_string[12];
- char c;
bool bv;
if ((count < 1) || (count > 11))
@@ -635,11 +608,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
if (count < 3) {
/* single char or single char followed by null */
- c = flags_string[0];
- if (strtobool(&c, &bv) == 0) {
+ if (strtobool(flags_string, &bv) == 0) {
global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
return count;
- } else if (!isdigit(c)) {
+ } else if (!isdigit(flags_string[0])) {
cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
flags_string);
return -EINVAL;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 66cf0f9fff89..c611ca2339d7 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,7 +25,7 @@
void cifs_dump_mem(char *label, void *data, int length);
void cifs_dump_detail(void *);
void cifs_dump_mids(struct TCP_Server_Info *);
-extern int traceSMB; /* flag which enables the function below */
+extern bool traceSMB; /* flag which enables the function below */
void dump_smb(void *, int);
#define CIFS_INFO 0x01
#define CIFS_RC 0x02
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2eea40353e60..fd8805de6a50 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -54,10 +54,10 @@
#endif
int cifsFYI = 0;
-int traceSMB = 0;
+bool traceSMB;
bool enable_oplocks = true;
-unsigned int linuxExtEnabled = 1;
-unsigned int lookupCacheEnabled = 1;
+bool linuxExtEnabled = true;
+bool lookupCacheEnabled = true;
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a25b2513f146..d21da9f05bae 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1596,11 +1596,11 @@ GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
-GLOBAL_EXTERN unsigned int lookupCacheEnabled;
+GLOBAL_EXTERN bool lookupCacheEnabled;
GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
-GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
+GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cde60741cad2..8a74a2a52e0f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1616,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
unsigned long flags;
- long slack = 0;
+ u64 slack = 0;
wait_queue_t wait;
ktime_t expires, *to = NULL;
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 45d650addd56..c20df77eff99 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -180,7 +180,7 @@ void nilfs_page_bug(struct page *page)
printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
"mapping=%p ino=%lu\n",
- page, atomic_read(&page->_count),
+ page, page_ref_count(page),
(unsigned long long)page->index, page->flags, m, ino);
if (page_has_buffers(page)) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4f764c2ac1a5..b1755b23893e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -434,7 +434,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
&& !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
else
- seq_putc(m, '0');
+ seq_puts(m, "0\n");
return 0;
}
@@ -2158,6 +2158,7 @@ static const struct file_operations proc_map_files_operations = {
.llseek = default_llseek,
};
+#ifdef CONFIG_CHECKPOINT_RESTORE
struct timers_private {
struct pid *pid;
struct task_struct *task;
@@ -2256,6 +2257,73 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
+#endif
+
+static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *p;
+ u64 slack_ns;
+ int err;
+
+ err = kstrtoull_from_user(buf, count, 10, &slack_ns);
+ if (err < 0)
+ return err;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
+ task_lock(p);
+ if (slack_ns == 0)
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ else
+ p->timer_slack_ns = slack_ns;
+ task_unlock(p);
+ } else
+ count = -EPERM;
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int timerslack_ns_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+ int err = 0;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
+ task_lock(p);
+ seq_printf(m, "%llu\n", p->timer_slack_ns);
+ task_unlock(p);
+ } else
+ err = -EPERM;
+
+ put_task_struct(p);
+
+ return err;
+}
+
+static int timerslack_ns_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timerslack_ns_show, inode);
+}
+
+static const struct file_operations proc_pid_set_timerslack_ns_operations = {
+ .open = timerslack_ns_open,
+ .read = seq_read,
+ .write = timerslack_ns_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2831,6 +2899,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
#endif
+ REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index df4661abadc4..83720460c5bc 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -29,10 +29,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
unsigned long committed;
long cached;
long available;
- unsigned long pagecache;
- unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
- struct zone *zone;
int lru;
/*
@@ -51,33 +48,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
- for_each_zone(zone)
- wmark_low += zone->watermark[WMARK_LOW];
-
- /*
- * Estimate the amount of memory available for userspace allocations,
- * without causing swapping.
- */
- available = i.freeram - totalreserve_pages;
-
- /*
- * Not all the page cache can be freed, otherwise the system will
- * start swapping. Assume at least half of the page cache, or the
- * low watermark worth of cache, needs to stay.
- */
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
- pagecache -= min(pagecache / 2, wmark_low);
- available += pagecache;
-
- /*
- * Part of the reclaimable slab consists of items that are in use,
- * and cannot be freed. Cap this estimate at the low watermark.
- */
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
-
- if (available < 0)
- available = 0;
+ available = si_mem_available();
/*
* Tagged format, for easy grepping and expansion.
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b2855eea5405..712f1b9992cc 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapcount() is not enough.
+ * simple test in page_mapped() is not enough.
*/
- if (!PageSlab(page) && page_mapcount(page))
+ if (!PageSlab(page) && page_mapped(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
@@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page)
*/
if (PageBuddy(page))
u |= 1 << KPF_BUDDY;
+ else if (page_count(page) == 0 && is_free_buddy_page(page))
+ u |= 1 << KPF_BUDDY;
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
@@ -158,6 +160,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
+ if (PageTail(page) && PageSlab(compound_head(page)))
+ u |= 1 << KPF_SLAB;
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4e61388ec03d..55bb57e6a30d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -231,7 +231,9 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
list_for_each_entry(m, &vmcore_list, list) {
if (*fpos < m->offset + m->size) {
- tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
+ tsz = (size_t)min_t(unsigned long long,
+ m->offset + m->size - *fpos,
+ buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
if (tmp < 0)
@@ -461,7 +463,8 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
if (start < m->offset + m->size) {
u64 paddr = 0;
- tsz = min_t(size_t, m->offset + m->size - start, size);
+ tsz = (size_t)min_t(unsigned long long,
+ m->offset + m->size - start, size);
paddr = m->paddr + start - m->offset;
if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
paddr >> PAGE_SHIFT, tsz,
diff --git a/fs/select.c b/fs/select.c
index 79d0d4953cad..869293988c2a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv)
return slack;
}
-long select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec *tv)
{
- unsigned long ret;
+ u64 ret;
struct timespec now;
/*
@@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
struct poll_wqueues table;
poll_table *wait;
int retval, i, timed_out = 0;
- unsigned long slack = 0;
+ u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_end = 0;
@@ -784,7 +784,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
- unsigned long slack = 0;
+ u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_end = 0;
diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h
index eb1973bad80b..5e1f345b58dd 100644
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -98,14 +98,14 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
#define atomic_long_xchg(v, new) \
(ATOMIC_LONG_PFX(_xchg)((ATOMIC_LONG_PFX(_t) *)(v), (new)))
-static inline void atomic_long_inc(atomic_long_t *l)
+static __always_inline void atomic_long_inc(atomic_long_t *l)
{
ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
ATOMIC_LONG_PFX(_inc)(v);
}
-static inline void atomic_long_dec(atomic_long_t *l)
+static __always_inline void atomic_long_dec(atomic_long_t *l)
{
ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
@@ -113,7 +113,7 @@ static inline void atomic_long_dec(atomic_long_t *l)
}
#define ATOMIC_LONG_OP(op) \
-static inline void \
+static __always_inline void \
atomic_long_##op(long i, atomic_long_t *l) \
{ \
ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l; \
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 630dd2372238..f90588abbfd4 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -81,6 +81,12 @@ extern void warn_slowpath_null(const char *file, const int line);
do { printk(arg); __WARN_TAINT(taint); } while (0)
#endif
+/* used internally by panic.c */
+struct warn_args;
+
+void __warn(const char *file, int line, void *caller, unsigned taint,
+ struct pt_regs *regs, struct warn_args *args);
+
#ifndef WARN_ON
#define WARN_ON(condition) ({ \
int __ret_warn_on = !!(condition); \
@@ -110,9 +116,10 @@ extern void warn_slowpath_null(const char *file, const int line);
static bool __section(.data.unlikely) __warned; \
int __ret_warn_once = !!(condition); \
\
- if (unlikely(__ret_warn_once)) \
- if (WARN_ON(!__warned)) \
- __warned = true; \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ __warned = true; \
+ WARN_ON(1); \
+ } \
unlikely(__ret_warn_once); \
})
@@ -120,9 +127,10 @@ extern void warn_slowpath_null(const char *file, const int line);
static bool __section(.data.unlikely) __warned; \
int __ret_warn_once = !!(condition); \
\
- if (unlikely(__ret_warn_once)) \
- if (WARN(!__warned, format)) \
- __warned = true; \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ __warned = true; \
+ WARN(1, format); \
+ } \
unlikely(__ret_warn_once); \
})
@@ -130,9 +138,10 @@ extern void warn_slowpath_null(const char *file, const int line);
static bool __section(.data.unlikely) __warned; \
int __ret_warn_once = !!(condition); \
\
- if (unlikely(__ret_warn_once)) \
- if (WARN_TAINT(!__warned, taint, format)) \
- __warned = true; \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ __warned = true; \
+ WARN_TAINT(1, taint, format); \
+ } \
unlikely(__ret_warn_once); \
})
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index c370b261c720..9401f4819891 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -783,6 +783,23 @@ static inline int pmd_clear_huge(pmd_t *pmd)
}
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * ARCHes with special requirements for evicting THP backing TLB entries can
+ * implement this. Otherwise also, it can help optimize normal TLB flush in
+ * THP regime. stock flush_tlb_range() typically has optimization to nuke the
+ * entire TLB TLB if flush span is greater than a threshold, which will
+ * likely be true for a single huge page. Thus a single thp flush will
+ * invalidate the entire TLB which is not desitable.
+ * e.g. see arch/arc: flush_pmd_tlb_range
+ */
+#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
+#else
+#define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG()
+#endif
+#endif
+
#endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 89d9aa9e79bf..c67f052cc5e5 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -82,15 +82,15 @@ struct buffer_head {
* and buffer_foo() functions.
*/
#define BUFFER_FNS(bit, name) \
-static inline void set_buffer_##name(struct buffer_head *bh) \
+static __always_inline void set_buffer_##name(struct buffer_head *bh) \
{ \
set_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline void clear_buffer_##name(struct buffer_head *bh) \
+static __always_inline void clear_buffer_##name(struct buffer_head *bh) \
{ \
clear_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline int buffer_##name(const struct buffer_head *bh) \
+static __always_inline int buffer_##name(const struct buffer_head *bh) \
{ \
return test_bit(BH_##bit, &(bh)->b_state); \
}
@@ -99,11 +99,11 @@ static inline int buffer_##name(const struct buffer_head *bh) \
* test_set_buffer_foo() and test_clear_buffer_foo()
*/
#define TAS_BUFFER_FNS(bit, name) \
-static inline int test_set_buffer_##name(struct buffer_head *bh) \
+static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \
{ \
return test_and_set_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline int test_clear_buffer_##name(struct buffer_head *bh) \
+static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
{ \
return test_and_clear_bit(BH_##bit, &(bh)->b_state); \
} \
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 4cd4ddf64cc7..d7c8de583a23 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -52,6 +52,10 @@ extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
extern bool compaction_restarting(struct zone *zone, int order);
+extern int kcompactd_run(int nid);
+extern void kcompactd_stop(int nid);
+extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
+
#else
static inline unsigned long try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, int alloc_flags,
@@ -84,6 +88,18 @@ static inline bool compaction_deferred(struct zone *zone, int order)
return true;
}
+static inline int kcompactd_run(int nid)
+{
+ return 0;
+}
+static inline void kcompactd_stop(int nid)
+{
+}
+
+static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+}
+
#endif /* CONFIG_COMPACTION */
#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 6b7fd9cf5ea2..dd03e837ebb7 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -231,7 +231,7 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
* call this with locks held.
*/
static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
- unsigned long delta, const enum hrtimer_mode mode)
+ u64 delta, const enum hrtimer_mode mode)
{
int __retval;
freezer_do_not_count();
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index bb16dfeb917e..570383a41853 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -105,8 +105,6 @@ struct vm_area_struct;
*
* __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
* This takes precedence over the __GFP_MEMALLOC flag if both are set.
- *
- * __GFP_NOACCOUNT ignores the accounting for kmemcg limit enforcement.
*/
#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
@@ -259,7 +257,7 @@ struct vm_area_struct;
#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \
- ~__GFP_KSWAPD_RECLAIM)
+ ~__GFP_RECLAIM)
/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
@@ -333,22 +331,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
* 0xe => BAD (MOVABLE+DMA32+HIGHMEM)
* 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
*
- * ZONES_SHIFT must be <= 2 on 32 bit platforms.
+ * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
*/
-#if 16 * ZONES_SHIFT > BITS_PER_LONG
-#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
+#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
+/* ZONE_DEVICE is not a valid GFP zone specifier */
+#define GFP_ZONES_SHIFT 2
+#else
+#define GFP_ZONES_SHIFT ZONES_SHIFT
+#endif
+
+#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
+#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif
#define GFP_ZONE_TABLE ( \
- (ZONE_NORMAL << 0 * ZONES_SHIFT) \
- | (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT) \
- | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT) \
- | (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT) \
- | (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT) \
- | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT) \
- | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT) \
- | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT) \
+ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \
+ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \
+ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \
+ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \
+ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \
+ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \
+ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
+ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)
/*
@@ -373,8 +378,8 @@ static inline enum zone_type gfp_zone(gfp_t flags)
enum zone_type z;
int bit = (__force int) (flags & GFP_ZONEMASK);
- z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
- ((1 << ZONES_SHIFT) - 1);
+ z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
+ ((1 << GFP_ZONES_SHIFT) - 1);
VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
return z;
}
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2ead22dd74a0..c98c6539e2c2 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -220,7 +220,7 @@ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time
timer->node.expires = ktime_add_safe(time, delta);
}
-static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
+static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta)
{
timer->_softexpires = time;
timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta));
@@ -378,7 +378,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
/* Basic timer operations: */
extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long range_ns, const enum hrtimer_mode mode);
+ u64 range_ns, const enum hrtimer_mode mode);
/**
* hrtimer_start - (re)start an hrtimer on the current CPU
@@ -399,7 +399,7 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer);
static inline void hrtimer_start_expires(struct hrtimer *timer,
enum hrtimer_mode mode)
{
- unsigned long delta;
+ u64 delta;
ktime_t soft, hard;
soft = hrtimer_get_softexpires(timer);
hard = hrtimer_get_expires(timer);
@@ -477,10 +477,12 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
struct task_struct *tsk);
-extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode);
extern int schedule_hrtimeout_range_clock(ktime_t *expires,
- unsigned long delta, const enum hrtimer_mode mode, int clock);
+ u64 delta,
+ const enum hrtimer_mode mode,
+ int clock);
extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
/* Soft interrupt function to run the hrtimer queues: */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 459fd25b378e..5307dfb3f8ec 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -41,7 +41,8 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
@@ -71,12 +72,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
((__vma)->vm_flags & VM_HUGEPAGE))) && \
!((__vma)->vm_flags & VM_NOHUGEPAGE) && \
!is_vma_temporary_stack(__vma))
-#define transparent_hugepage_defrag(__vma) \
- ((transparent_hugepage_flags & \
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) || \
- (transparent_hugepage_flags & \
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \
- (__vma)->vm_flags & VM_HUGEPAGE))
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
@@ -101,16 +96,21 @@ static inline int split_huge_page(struct page *page)
void deferred_split_huge_page(struct page *page);
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address);
+ unsigned long address, bool freeze);
#define split_huge_pmd(__vma, __pmd, __address) \
do { \
pmd_t *____pmd = (__pmd); \
if (pmd_trans_huge(*____pmd) \
|| pmd_devmap(*____pmd)) \
- __split_huge_pmd(__vma, __pmd, __address); \
+ __split_huge_pmd(__vma, __pmd, __address, \
+ false); \
} while (0)
+
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page);
+
#if HPAGE_PMD_ORDER >= MAX_ORDER
#error "hugepages can't be allocated by the buddy allocator"
#endif
@@ -178,6 +178,10 @@ static inline int split_huge_page(struct page *page)
static inline void deferred_split_huge_page(struct page *page) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
+
+static inline void split_huge_pmd_address(struct vm_area_struct *vma,
+ unsigned long address, bool freeze, struct page *page) {}
+
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f31638c6e873..f4fa2b29c38c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -357,6 +357,7 @@ int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
+int __must_check kstrtobool(const char *s, bool *res);
int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
@@ -368,6 +369,7 @@ int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigne
int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
+int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res);
static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
{
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index ee7229a6c06a..cb483305e1f5 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -48,7 +48,7 @@ static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)
-static inline int hlist_bl_unhashed(const struct hlist_bl_node *h)
+static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h)
{
return !h->pprev;
}
@@ -68,7 +68,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h,
h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
}
-static inline int hlist_bl_empty(const struct hlist_bl_head *h)
+static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f0c4bec6565b..1191d79aa495 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -52,7 +52,10 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
/* default hierarchy stats */
- MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS,
+ MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS,
+ MEMCG_SLAB_RECLAIMABLE,
+ MEMCG_SLAB_UNRECLAIMABLE,
+ MEMCG_SOCK,
MEMCG_NR_STAT,
};
@@ -400,6 +403,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages);
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask);
+
static inline
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
@@ -658,6 +664,13 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
{
}
+static inline unsigned long
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ return 0;
+}
+
static inline void
mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
@@ -792,11 +805,6 @@ static inline bool memcg_kmem_enabled(void)
return static_branch_unlikely(&memcg_kmem_enabled_key);
}
-static inline bool memcg_kmem_online(struct mem_cgroup *memcg)
-{
- return memcg->kmem_state == KMEM_ONLINE;
-}
-
/*
* In general, we'll do everything in our power to not incur in any overhead
* for non-memcg users for the kmem functions. Not even a function call, if we
@@ -883,6 +891,20 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
if (memcg_kmem_enabled())
__memcg_kmem_put_cache(cachep);
}
+
+/**
+ * memcg_kmem_update_page_stat - update kmem page state statistics
+ * @page: the page
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ */
+static inline void memcg_kmem_update_page_stat(struct page *page,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ if (memcg_kmem_enabled() && page->mem_cgroup)
+ this_cpu_add(page->mem_cgroup->stat->count[idx], val);
+}
+
#else
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
@@ -892,11 +914,6 @@ static inline bool memcg_kmem_enabled(void)
return false;
}
-static inline bool memcg_kmem_online(struct mem_cgroup *memcg)
-{
- return false;
-}
-
static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
return 0;
@@ -928,6 +945,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
}
+
+static inline void memcg_kmem_update_page_stat(struct page *page,
+ enum mem_cgroup_stat_index idx, int val)
+{
+}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dbf1eddab964..7d42501c8bb4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -22,6 +22,7 @@
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
+#include <linux/page_ref.h>
struct mempolicy;
struct anon_vma;
@@ -82,6 +83,27 @@ extern int mmap_rnd_compat_bits __read_mostly;
#define mm_forbids_zeropage(X) (0)
#endif
+/*
+ * Default maximum number of active map areas, this limits the number of vmas
+ * per mm struct. Users can overwrite this number by sysctl but there is a
+ * problem.
+ *
+ * When a program's coredump is generated as ELF format, a section is created
+ * per a vma. In ELF, the number of sections is represented in unsigned short.
+ * This means the number of sections should be smaller than 65535 at coredump.
+ * Because the kernel adds some informative sections to a image of program at
+ * generating coredump, we need some margin. The number of extra sections is
+ * 1-3 now and depends on arch. We use "5" as safe margin, here.
+ *
+ * ELF extended numbering allows more than 65535 sections, so 16-bit bound is
+ * not a hard limit any more. Although some userspace tools can be surprised by
+ * that.
+ */
+#define MAPCOUNT_ELF_CORE_MARGIN (5)
+#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
+
+extern int sysctl_max_map_count;
+
extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
@@ -122,6 +144,7 @@ extern unsigned int kobjsize(const void *objp);
/*
* vm_flags in vm_area_struct, see mm_types.h.
+ * When changing, update also include/trace/events/mmflags.h
*/
#define VM_NONE 0x00000000
@@ -364,8 +387,8 @@ static inline int pmd_devmap(pmd_t pmd)
*/
static inline int put_page_testzero(struct page *page)
{
- VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0, page);
- return atomic_dec_and_test(&page->_count);
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+ return page_ref_dec_and_test(page);
}
/*
@@ -376,7 +399,7 @@ static inline int put_page_testzero(struct page *page)
*/
static inline int get_page_unless_zero(struct page *page)
{
- return atomic_inc_not_zero(&page->_count);
+ return page_ref_add_unless(page, 1, 0);
}
extern int page_is_ram(unsigned long pfn);
@@ -464,11 +487,6 @@ static inline int total_mapcount(struct page *page)
}
#endif
-static inline int page_count(struct page *page)
-{
- return atomic_read(&compound_head(page)->_count);
-}
-
static inline struct page *virt_to_head_page(const void *x)
{
struct page *page = virt_to_page(x);
@@ -476,15 +494,6 @@ static inline struct page *virt_to_head_page(const void *x)
return compound_head(page);
}
-/*
- * Setup the page count before being freed into the page allocator for
- * the first time (boot or memory hotplug)
- */
-static inline void init_page_count(struct page *page)
-{
- atomic_set(&page->_count, 1);
-}
-
void __put_page(struct page *page);
void put_pages_list(struct list_head *pages);
@@ -694,8 +703,8 @@ static inline void get_page(struct page *page)
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
- VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
- atomic_inc(&page->_count);
+ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
+ page_ref_inc(page);
if (unlikely(is_zone_device_page(page)))
get_zone_device_page(page);
@@ -1043,8 +1052,6 @@ static inline void clear_page_pfmemalloc(struct page *page)
* just gets major/minor fault counters bumped up.
*/
-#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
-
#define VM_FAULT_OOM 0x0001
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004
@@ -1523,8 +1530,7 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm)
}
#endif
-int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
- pmd_t *pmd, unsigned long address);
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
/*
@@ -1650,15 +1656,15 @@ static inline void pgtable_page_dtor(struct page *page)
pte_unmap(pte); \
} while (0)
-#define pte_alloc_map(mm, vma, pmd, address) \
- ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \
- pmd, address))? \
- NULL: pte_offset_map(pmd, address))
+#define pte_alloc(mm, pmd, address) \
+ (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, address))
+
+#define pte_alloc_map(mm, pmd, address) \
+ (pte_alloc(mm, pmd, address) ? NULL : pte_offset_map(pmd, address))
#define pte_alloc_map_lock(mm, pmd, address, ptlp) \
- ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \
- pmd, address))? \
- NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
+ (pte_alloc(mm, pmd, address) ? \
+ NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
#define pte_alloc_kernel(pmd, address) \
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
@@ -1853,6 +1859,7 @@ extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
extern void show_mem(unsigned int flags);
+extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
@@ -1867,6 +1874,7 @@ extern void zone_pcp_reset(struct zone *zone);
/* page_alloc.c */
extern int min_free_kbytes;
+extern int watermark_scale_factor;
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6de02ac378a0..c60df9257cc7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -668,6 +668,12 @@ typedef struct pglist_data {
mem_hotplug_begin/end() */
int kswapd_max_order;
enum zone_type classzone_idx;
+#ifdef CONFIG_COMPACTION
+ int kcompactd_max_order;
+ enum zone_type kcompactd_classzone_idx;
+ wait_queue_head_t kcompactd_wait;
+ struct task_struct *kcompactd;
+#endif
#ifdef CONFIG_NUMA_BALANCING
/* Lock serializing the migrate rate limiting window */
spinlock_t numabalancing_migrate_lock;
@@ -835,6 +841,8 @@ static inline int is_highmem(struct zone *zone)
struct ctl_table;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index da523661500a..77b078c103b2 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -17,6 +17,8 @@
#define ZONES_SHIFT 1
#elif MAX_NR_ZONES <= 4
#define ZONES_SHIFT 2
+#elif MAX_NR_ZONES <= 8
+#define ZONES_SHIFT 3
#else
#error ZONES_SHIFT -- too many zones configured adjust calculation
#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 19724e6ebd26..f4ed4f1b0c77 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -144,12 +144,12 @@ static inline struct page *compound_head(struct page *page)
return page;
}
-static inline int PageTail(struct page *page)
+static __always_inline int PageTail(struct page *page)
{
return READ_ONCE(page->compound_head) & 1;
}
-static inline int PageCompound(struct page *page)
+static __always_inline int PageCompound(struct page *page)
{
return test_bit(PG_head, &page->flags) || PageTail(page);
}
@@ -184,31 +184,31 @@ static inline int PageCompound(struct page *page)
* Macros to create function definitions for page flags
*/
#define TESTPAGEFLAG(uname, lname, policy) \
-static inline int Page##uname(struct page *page) \
+static __always_inline int Page##uname(struct page *page) \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
#define SETPAGEFLAG(uname, lname, policy) \
-static inline void SetPage##uname(struct page *page) \
+static __always_inline void SetPage##uname(struct page *page) \
{ set_bit(PG_##lname, &policy(page, 1)->flags); }
#define CLEARPAGEFLAG(uname, lname, policy) \
-static inline void ClearPage##uname(struct page *page) \
+static __always_inline void ClearPage##uname(struct page *page) \
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define __SETPAGEFLAG(uname, lname, policy) \
-static inline void __SetPage##uname(struct page *page) \
+static __always_inline void __SetPage##uname(struct page *page) \
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
#define __CLEARPAGEFLAG(uname, lname, policy) \
-static inline void __ClearPage##uname(struct page *page) \
+static __always_inline void __ClearPage##uname(struct page *page) \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTSETFLAG(uname, lname, policy) \
-static inline int TestSetPage##uname(struct page *page) \
+static __always_inline int TestSetPage##uname(struct page *page) \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTCLEARFLAG(uname, lname, policy) \
-static inline int TestClearPage##uname(struct page *page) \
+static __always_inline int TestClearPage##uname(struct page *page) \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define PAGEFLAG(uname, lname, policy) \
@@ -371,7 +371,7 @@ PAGEFLAG(Idle, idle, PF_ANY)
#define PAGE_MAPPING_KSM 2
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
-static inline int PageAnon(struct page *page)
+static __always_inline int PageAnon(struct page *page)
{
page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
@@ -384,7 +384,7 @@ static inline int PageAnon(struct page *page)
* is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any
* anon_vma, but to that page's node of the stable tree.
*/
-static inline int PageKsm(struct page *page)
+static __always_inline int PageKsm(struct page *page)
{
page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
@@ -415,14 +415,14 @@ static inline int PageUptodate(struct page *page)
return ret;
}
-static inline void __SetPageUptodate(struct page *page)
+static __always_inline void __SetPageUptodate(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
__set_bit(PG_uptodate, &page->flags);
}
-static inline void SetPageUptodate(struct page *page)
+static __always_inline void SetPageUptodate(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
/*
@@ -456,12 +456,12 @@ static inline void set_page_writeback_keepwrite(struct page *page)
__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
-static inline void set_compound_head(struct page *page, struct page *head)
+static __always_inline void set_compound_head(struct page *page, struct page *head)
{
WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
}
-static inline void clear_compound_head(struct page *page)
+static __always_inline void clear_compound_head(struct page *page)
{
WRITE_ONCE(page->compound_head, 0);
}
@@ -593,6 +593,8 @@ static inline void __ClearPageBuddy(struct page *page)
atomic_set(&page->_mapcount, -1);
}
+extern bool is_free_buddy_page(struct page *page);
+
#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
static inline int PageBalloon(struct page *page)
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
new file mode 100644
index 000000000000..e596d5d9540e
--- /dev/null
+++ b/include/linux/page_ref.h
@@ -0,0 +1,173 @@
+#ifndef _LINUX_PAGE_REF_H
+#define _LINUX_PAGE_REF_H
+
+#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/page-flags.h>
+#include <linux/tracepoint-defs.h>
+
+extern struct tracepoint __tracepoint_page_ref_set;
+extern struct tracepoint __tracepoint_page_ref_mod;
+extern struct tracepoint __tracepoint_page_ref_mod_and_test;
+extern struct tracepoint __tracepoint_page_ref_mod_and_return;
+extern struct tracepoint __tracepoint_page_ref_mod_unless;
+extern struct tracepoint __tracepoint_page_ref_freeze;
+extern struct tracepoint __tracepoint_page_ref_unfreeze;
+
+#ifdef CONFIG_DEBUG_PAGE_REF
+
+/*
+ * Ideally we would want to use the trace_<tracepoint>_enabled() helper
+ * functions. But due to include header file issues, that is not
+ * feasible. Instead we have to open code the static key functions.
+ *
+ * See trace_##name##_enabled(void) in include/linux/tracepoint.h
+ */
+#define page_ref_tracepoint_active(t) static_key_false(&(t).key)
+
+extern void __page_ref_set(struct page *page, int v);
+extern void __page_ref_mod(struct page *page, int v);
+extern void __page_ref_mod_and_test(struct page *page, int v, int ret);
+extern void __page_ref_mod_and_return(struct page *page, int v, int ret);
+extern void __page_ref_mod_unless(struct page *page, int v, int u);
+extern void __page_ref_freeze(struct page *page, int v, int ret);
+extern void __page_ref_unfreeze(struct page *page, int v);
+
+#else
+
+#define page_ref_tracepoint_active(t) false
+
+static inline void __page_ref_set(struct page *page, int v)
+{
+}
+static inline void __page_ref_mod(struct page *page, int v)
+{
+}
+static inline void __page_ref_mod_and_test(struct page *page, int v, int ret)
+{
+}
+static inline void __page_ref_mod_and_return(struct page *page, int v, int ret)
+{
+}
+static inline void __page_ref_mod_unless(struct page *page, int v, int u)
+{
+}
+static inline void __page_ref_freeze(struct page *page, int v, int ret)
+{
+}
+static inline void __page_ref_unfreeze(struct page *page, int v)
+{
+}
+
+#endif
+
+static inline int page_ref_count(struct page *page)
+{
+ return atomic_read(&page->_count);
+}
+
+static inline int page_count(struct page *page)
+{
+ return atomic_read(&compound_head(page)->_count);
+}
+
+static inline void set_page_count(struct page *page, int v)
+{
+ atomic_set(&page->_count, v);
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
+ __page_ref_set(page, v);
+}
+
+/*
+ * Setup the page count before being freed into the page allocator for
+ * the first time (boot or memory hotplug)
+ */
+static inline void init_page_count(struct page *page)
+{
+ set_page_count(page, 1);
+}
+
+static inline void page_ref_add(struct page *page, int nr)
+{
+ atomic_add(nr, &page->_count);
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
+ __page_ref_mod(page, nr);
+}
+
+static inline void page_ref_sub(struct page *page, int nr)
+{
+ atomic_sub(nr, &page->_count);
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
+ __page_ref_mod(page, -nr);
+}
+
+static inline void page_ref_inc(struct page *page)
+{
+ atomic_inc(&page->_count);
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
+ __page_ref_mod(page, 1);
+}
+
+static inline void page_ref_dec(struct page *page)
+{
+ atomic_dec(&page->_count);
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
+ __page_ref_mod(page, -1);
+}
+
+static inline int page_ref_sub_and_test(struct page *page, int nr)
+{
+ int ret = atomic_sub_and_test(nr, &page->_count);
+
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
+ __page_ref_mod_and_test(page, -nr, ret);
+ return ret;
+}
+
+static inline int page_ref_dec_and_test(struct page *page)
+{
+ int ret = atomic_dec_and_test(&page->_count);
+
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
+ __page_ref_mod_and_test(page, -1, ret);
+ return ret;
+}
+
+static inline int page_ref_dec_return(struct page *page)
+{
+ int ret = atomic_dec_return(&page->_count);
+
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
+ __page_ref_mod_and_return(page, -1, ret);
+ return ret;
+}
+
+static inline int page_ref_add_unless(struct page *page, int nr, int u)
+{
+ int ret = atomic_add_unless(&page->_count, nr, u);
+
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless))
+ __page_ref_mod_unless(page, nr, ret);
+ return ret;
+}
+
+static inline int page_ref_freeze(struct page *page, int count)
+{
+ int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze))
+ __page_ref_freeze(page, count, ret);
+ return ret;
+}
+
+static inline void page_ref_unfreeze(struct page *page, int count)
+{
+ VM_BUG_ON_PAGE(page_count(page) != 0, page);
+ VM_BUG_ON(count == 0);
+
+ atomic_set(&page->_count, count);
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
+ __page_ref_unfreeze(page, count);
+}
+
+#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 183b15ea052b..1ebd65c91422 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -165,7 +165,7 @@ static inline int page_cache_get_speculative(struct page *page)
* SMP requires.
*/
VM_BUG_ON_PAGE(page_count(page) == 0, page);
- atomic_inc(&page->_count);
+ page_ref_inc(page);
#else
if (unlikely(!get_page_unless_zero(page))) {
@@ -194,10 +194,10 @@ static inline int page_cache_add_speculative(struct page *page, int count)
VM_BUG_ON(!in_atomic());
# endif
VM_BUG_ON_PAGE(page_count(page) == 0, page);
- atomic_add(count, &page->_count);
+ page_ref_add(page, count);
#else
- if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
+ if (unlikely(!page_ref_add_unless(page, count, 0)))
return 0;
#endif
VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page);
@@ -205,19 +205,6 @@ static inline int page_cache_add_speculative(struct page *page, int count)
return 1;
}
-static inline int page_freeze_refs(struct page *page, int count)
-{
- return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
-}
-
-static inline void page_unfreeze_refs(struct page *page, int count)
-{
- VM_BUG_ON_PAGE(page_count(page) != 0, page);
- VM_BUG_ON(count == 0);
-
- atomic_set(&page->_count, count);
-}
-
#ifdef CONFIG_NUMA
extern struct page *__page_cache_alloc(gfp_t gfp);
#else
diff --git a/include/linux/poll.h b/include/linux/poll.h
index c08386fb3e08..9fb4f40d9a26 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
ktime_t *expires, unsigned long slack);
-extern long select_estimate_accuracy(struct timespec *tv);
+extern u64 select_estimate_accuracy(struct timespec *tv);
static inline int poll_schedule(struct poll_wqueues *pwq, int state)
diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h
index bd466439c588..3bdfa70bc642 100644
--- a/include/linux/quicklist.h
+++ b/include/linux/quicklist.h
@@ -5,7 +5,7 @@
* as needed after allocation when they are freed. Per cpu lists of pages
* are kept that only contain node local pages.
*
- * (C) 2007, SGI. Christoph Lameter <clameter@sgi.com>
+ * (C) 2007, SGI. Christoph Lameter <cl@linux.com>
*/
#include <linux/kernel.h>
#include <linux/gfp.h>
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index f54be7082207..51a97ac8bfbf 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -21,6 +21,7 @@
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H
+#include <linux/bitops.h>
#include <linux/preempt.h>
#include <linux/types.h>
#include <linux/bug.h>
@@ -270,8 +271,15 @@ static inline void radix_tree_replace_slot(void **pslot, void *item)
}
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
- struct radix_tree_node **nodep, void ***slotp);
-int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+ unsigned order, struct radix_tree_node **nodep,
+ void ***slotp);
+int __radix_tree_insert(struct radix_tree_root *, unsigned long index,
+ unsigned order, void *);
+static inline int radix_tree_insert(struct radix_tree_root *root,
+ unsigned long index, void *entry)
+{
+ return __radix_tree_insert(root, index, 0, entry);
+}
void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
struct radix_tree_node **nodep, void ***slotp);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
@@ -395,6 +403,22 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
}
/**
+ * radix_tree_iter_next - resume iterating when the chunk may be invalid
+ * @iter: iterator state
+ *
+ * If the iterator needs to release then reacquire a lock, the chunk may
+ * have been invalidated by an insertion or deletion. Call this function
+ * to continue the iteration from the next index.
+ */
+static inline __must_check
+void **radix_tree_iter_next(struct radix_tree_iter *iter)
+{
+ iter->next_index = iter->index + 1;
+ iter->tags = 0;
+ return NULL;
+}
+
+/**
* radix_tree_chunk_size - get current chunk size
*
* @iter: pointer to radix tree iterator
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index a07f42bedda3..49eb4f8ebac9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -86,6 +86,7 @@ enum ttu_flags {
TTU_MIGRATION = 2, /* migration mode */
TTU_MUNLOCK = 4, /* munlock mode */
TTU_LZFREE = 8, /* lazy free mode */
+ TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
@@ -93,6 +94,8 @@ enum ttu_flags {
TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
* and caller guarantees they will
* do a final flush if necessary */
+ TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock:
+ * caller holds it */
};
#ifdef CONFIG_MMU
@@ -240,6 +243,8 @@ int page_mkclean(struct page *);
*/
int try_to_munlock(struct page *);
+void remove_migration_ptes(struct page *old, struct page *new, bool locked);
+
/*
* Called by memory-failure.c to kill processes.
*/
@@ -266,6 +271,7 @@ struct rmap_walk_control {
};
int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
+int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
#else /* !CONFIG_MMU */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd242bed4abb..084ed9fba620 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1791,8 +1791,8 @@ struct task_struct {
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
- unsigned long timer_slack_ns;
- unsigned long default_timer_slack_ns;
+ u64 timer_slack_ns;
+ u64 default_timer_slack_ns;
#ifdef CONFIG_KASAN
unsigned int kasan_depth;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 4f080ab4f2cd..22db1e63707e 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -14,27 +14,6 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
enum { sysctl_hung_task_timeout_secs = 0 };
#endif
-/*
- * Default maximum number of active map areas, this limits the number of vmas
- * per mm struct. Users can overwrite this number by sysctl but there is a
- * problem.
- *
- * When a program's coredump is generated as ELF format, a section is created
- * per a vma. In ELF, the number of sections is represented in unsigned short.
- * This means the number of sections should be smaller than 65535 at coredump.
- * Because the kernel adds some informative sections to a image of program at
- * generating coredump, we need some margin. The number of extra sections is
- * 1-3 now and depends on arch. We use "5" as safe margin, here.
- *
- * ELF extended numbering allows more than 65535 sections, so 16-bit bound is
- * not a hard limit any more. Although some userspace tools can be surprised by
- * that.
- */
-#define MAPCOUNT_ELF_CORE_MARGIN (5)
-#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
-
-extern int sysctl_max_map_count;
-
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
diff --git a/include/linux/string.h b/include/linux/string.h
index 9eebc66d957a..d3993a79a325 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -128,7 +128,13 @@ extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
extern void argv_free(char **argv);
extern bool sysfs_streq(const char *s1, const char *s2);
-extern int strtobool(const char *s, bool *res);
+extern int kstrtobool(const char *s, bool *res);
+static inline int strtobool(const char *s, bool *res)
+{
+ return kstrtobool(s, res);
+}
+
+int match_string(const char * const *array, size_t n, const char *string);
#ifdef CONFIG_BINARY_PRINTF
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args);
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 21f73649a4dc..62be0786d6d0 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -111,7 +111,7 @@ enum tick_dep_bits {
#define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
#ifdef CONFIG_NO_HZ_COMMON
-extern int tick_nohz_enabled;
+extern bool tick_nohz_enabled;
extern int tick_nohz_tick_stopped(void);
extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
diff --git a/include/linux/unaligned/access_ok.h b/include/linux/unaligned/access_ok.h
index 99c1b4d20b0f..33383ca23837 100644
--- a/include/linux/unaligned/access_ok.h
+++ b/include/linux/unaligned/access_ok.h
@@ -4,62 +4,62 @@
#include <linux/kernel.h>
#include <asm/byteorder.h>
-static inline u16 get_unaligned_le16(const void *p)
+static __always_inline u16 get_unaligned_le16(const void *p)
{
return le16_to_cpup((__le16 *)p);
}
-static inline u32 get_unaligned_le32(const void *p)
+static __always_inline u32 get_unaligned_le32(const void *p)
{
return le32_to_cpup((__le32 *)p);
}
-static inline u64 get_unaligned_le64(const void *p)
+static __always_inline u64 get_unaligned_le64(const void *p)
{
return le64_to_cpup((__le64 *)p);
}
-static inline u16 get_unaligned_be16(const void *p)
+static __always_inline u16 get_unaligned_be16(const void *p)
{
return be16_to_cpup((__be16 *)p);
}
-static inline u32 get_unaligned_be32(const void *p)
+static __always_inline u32 get_unaligned_be32(const void *p)
{
return be32_to_cpup((__be32 *)p);
}
-static inline u64 get_unaligned_be64(const void *p)
+static __always_inline u64 get_unaligned_be64(const void *p)
{
return be64_to_cpup((__be64 *)p);
}
-static inline void put_unaligned_le16(u16 val, void *p)
+static __always_inline void put_unaligned_le16(u16 val, void *p)
{
*((__le16 *)p) = cpu_to_le16(val);
}
-static inline void put_unaligned_le32(u32 val, void *p)
+static __always_inline void put_unaligned_le32(u32 val, void *p)
{
*((__le32 *)p) = cpu_to_le32(val);
}
-static inline void put_unaligned_le64(u64 val, void *p)
+static __always_inline void put_unaligned_le64(u64 val, void *p)
{
*((__le64 *)p) = cpu_to_le64(val);
}
-static inline void put_unaligned_be16(u16 val, void *p)
+static __always_inline void put_unaligned_be16(u16 val, void *p)
{
*((__be16 *)p) = cpu_to_be16(val);
}
-static inline void put_unaligned_be32(u32 val, void *p)
+static __always_inline void put_unaligned_be32(u32 val, void *p)
{
*((__be32 *)p) = cpu_to_be32(val);
}
-static inline void put_unaligned_be64(u64 val, void *p)
+static __always_inline void put_unaligned_be64(u64 val, void *p)
{
*((__be64 *)p) = cpu_to_be64(val);
}
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 67c1dbd19c6d..ec084321fe09 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -53,6 +53,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED,
COMPACTISOLATED,
COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
+ KCOMPACTD_WAKE,
#endif
#ifdef CONFIG_HUGETLB_PAGE
HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
@@ -71,6 +72,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_COLLAPSE_ALLOC_FAILED,
THP_SPLIT_PAGE,
THP_SPLIT_PAGE_FAILED,
+ THP_DEFERRED_SPLIT_PAGE,
THP_SPLIT_PMD,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 111e5666e5eb..e215bf68f521 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -350,6 +350,61 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset,
);
#endif
+TRACE_EVENT(mm_compaction_kcompactd_sleep,
+
+ TP_PROTO(int nid),
+
+ TP_ARGS(nid),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ ),
+
+ TP_printk("nid=%d", __entry->nid)
+);
+
+DECLARE_EVENT_CLASS(kcompactd_wake_template,
+
+ TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+
+ TP_ARGS(nid, order, classzone_idx),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(enum zone_type, classzone_idx)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->classzone_idx = classzone_idx;
+ ),
+
+ TP_printk("nid=%d order=%d classzone_idx=%-8s",
+ __entry->nid,
+ __entry->order,
+ __print_symbolic(__entry->classzone_idx, ZONE_TYPE))
+);
+
+DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd,
+
+ TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+
+ TP_ARGS(nid, order, classzone_idx)
+);
+
+DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake,
+
+ TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+
+ TP_ARGS(nid, order, classzone_idx)
+);
+
#endif /* _TRACE_COMPACTION_H */
/* This part must be outside protection */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a849185c82f0..43cedbf0c759 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -111,15 +111,21 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
) : "none"
#if defined(CONFIG_X86)
-#define __VM_ARCH_SPECIFIC {VM_PAT, "pat" }
+#define __VM_ARCH_SPECIFIC_1 {VM_PAT, "pat" }
#elif defined(CONFIG_PPC)
-#define __VM_ARCH_SPECIFIC {VM_SAO, "sao" }
+#define __VM_ARCH_SPECIFIC_1 {VM_SAO, "sao" }
#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
-#define __VM_ARCH_SPECIFIC {VM_GROWSUP, "growsup" }
+#define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP, "growsup" }
#elif !defined(CONFIG_MMU)
-#define __VM_ARCH_SPECIFIC {VM_MAPPED_COPY,"mappedcopy" }
+#define __VM_ARCH_SPECIFIC_1 {VM_MAPPED_COPY,"mappedcopy" }
#else
-#define __VM_ARCH_SPECIFIC {VM_ARCH_1, "arch_1" }
+#define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" }
+#endif
+
+#if defined(CONFIG_X86)
+#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" }
+#else
+#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" }
#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -138,19 +144,22 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
{VM_MAYEXEC, "mayexec" }, \
{VM_MAYSHARE, "mayshare" }, \
{VM_GROWSDOWN, "growsdown" }, \
+ {VM_UFFD_MISSING, "uffd_missing" }, \
{VM_PFNMAP, "pfnmap" }, \
{VM_DENYWRITE, "denywrite" }, \
- {VM_LOCKONFAULT, "lockonfault" }, \
+ {VM_UFFD_WP, "uffd_wp" }, \
{VM_LOCKED, "locked" }, \
{VM_IO, "io" }, \
{VM_SEQ_READ, "seqread" }, \
{VM_RAND_READ, "randread" }, \
{VM_DONTCOPY, "dontcopy" }, \
{VM_DONTEXPAND, "dontexpand" }, \
+ {VM_LOCKONFAULT, "lockonfault" }, \
{VM_ACCOUNT, "account" }, \
{VM_NORESERVE, "noreserve" }, \
{VM_HUGETLB, "hugetlb" }, \
- __VM_ARCH_SPECIFIC , \
+ __VM_ARCH_SPECIFIC_1 , \
+ __VM_ARCH_SPECIFIC_2 , \
{VM_DONTDUMP, "dontdump" }, \
IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
{VM_MIXEDMAP, "mixedmap" }, \
diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h
new file mode 100644
index 000000000000..81001f8b0db4
--- /dev/null
+++ b/include/trace/events/page_ref.h
@@ -0,0 +1,134 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM page_ref
+
+#if !defined(_TRACE_PAGE_REF_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_REF_H
+
+#include <linux/types.h>
+#include <linux/page_ref.h>
+#include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
+
+DECLARE_EVENT_CLASS(page_ref_mod_template,
+
+ TP_PROTO(struct page *page, int v),
+
+ TP_ARGS(page, v),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(unsigned long, flags)
+ __field(int, count)
+ __field(int, mapcount)
+ __field(void *, mapping)
+ __field(int, mt)
+ __field(int, val)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = page_to_pfn(page);
+ __entry->flags = page->flags;
+ __entry->count = page_ref_count(page);
+ __entry->mapcount = page_mapcount(page);
+ __entry->mapping = page->mapping;
+ __entry->mt = get_pageblock_migratetype(page);
+ __entry->val = v;
+ ),
+
+ TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d",
+ __entry->pfn,
+ show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)),
+ __entry->count,
+ __entry->mapcount, __entry->mapping, __entry->mt,
+ __entry->val)
+);
+
+DEFINE_EVENT(page_ref_mod_template, page_ref_set,
+
+ TP_PROTO(struct page *page, int v),
+
+ TP_ARGS(page, v)
+);
+
+DEFINE_EVENT(page_ref_mod_template, page_ref_mod,
+
+ TP_PROTO(struct page *page, int v),
+
+ TP_ARGS(page, v)
+);
+
+DECLARE_EVENT_CLASS(page_ref_mod_and_test_template,
+
+ TP_PROTO(struct page *page, int v, int ret),
+
+ TP_ARGS(page, v, ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(unsigned long, flags)
+ __field(int, count)
+ __field(int, mapcount)
+ __field(void *, mapping)
+ __field(int, mt)
+ __field(int, val)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = page_to_pfn(page);
+ __entry->flags = page->flags;
+ __entry->count = page_ref_count(page);
+ __entry->mapcount = page_mapcount(page);
+ __entry->mapping = page->mapping;
+ __entry->mt = get_pageblock_migratetype(page);
+ __entry->val = v;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d ret=%d",
+ __entry->pfn,
+ show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)),
+ __entry->count,
+ __entry->mapcount, __entry->mapping, __entry->mt,
+ __entry->val, __entry->ret)
+);
+
+DEFINE_EVENT(page_ref_mod_and_test_template, page_ref_mod_and_test,
+
+ TP_PROTO(struct page *page, int v, int ret),
+
+ TP_ARGS(page, v, ret)
+);
+
+DEFINE_EVENT(page_ref_mod_and_test_template, page_ref_mod_and_return,
+
+ TP_PROTO(struct page *page, int v, int ret),
+
+ TP_ARGS(page, v, ret)
+);
+
+DEFINE_EVENT(page_ref_mod_and_test_template, page_ref_mod_unless,
+
+ TP_PROTO(struct page *page, int v, int ret),
+
+ TP_ARGS(page, v, ret)
+);
+
+DEFINE_EVENT(page_ref_mod_and_test_template, page_ref_freeze,
+
+ TP_PROTO(struct page *page, int v, int ret),
+
+ TP_ARGS(page, v, ret)
+);
+
+DEFINE_EVENT(page_ref_mod_template, page_ref_unfreeze,
+
+ TP_PROTO(struct page *page, int v),
+
+ TP_ARGS(page, v)
+);
+
+#endif /* _TRACE_PAGE_COUNT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/byteorder/big_endian.h b/include/uapi/linux/byteorder/big_endian.h
index 672374450095..cdab17ab907c 100644
--- a/include/uapi/linux/byteorder/big_endian.h
+++ b/include/uapi/linux/byteorder/big_endian.h
@@ -40,51 +40,51 @@
#define __cpu_to_be16(x) ((__force __be16)(__u16)(x))
#define __be16_to_cpu(x) ((__force __u16)(__be16)(x))
-static inline __le64 __cpu_to_le64p(const __u64 *p)
+static __always_inline __le64 __cpu_to_le64p(const __u64 *p)
{
return (__force __le64)__swab64p(p);
}
-static inline __u64 __le64_to_cpup(const __le64 *p)
+static __always_inline __u64 __le64_to_cpup(const __le64 *p)
{
return __swab64p((__u64 *)p);
}
-static inline __le32 __cpu_to_le32p(const __u32 *p)
+static __always_inline __le32 __cpu_to_le32p(const __u32 *p)
{
return (__force __le32)__swab32p(p);
}
-static inline __u32 __le32_to_cpup(const __le32 *p)
+static __always_inline __u32 __le32_to_cpup(const __le32 *p)
{
return __swab32p((__u32 *)p);
}
-static inline __le16 __cpu_to_le16p(const __u16 *p)
+static __always_inline __le16 __cpu_to_le16p(const __u16 *p)
{
return (__force __le16)__swab16p(p);
}
-static inline __u16 __le16_to_cpup(const __le16 *p)
+static __always_inline __u16 __le16_to_cpup(const __le16 *p)
{
return __swab16p((__u16 *)p);
}
-static inline __be64 __cpu_to_be64p(const __u64 *p)
+static __always_inline __be64 __cpu_to_be64p(const __u64 *p)
{
return (__force __be64)*p;
}
-static inline __u64 __be64_to_cpup(const __be64 *p)
+static __always_inline __u64 __be64_to_cpup(const __be64 *p)
{
return (__force __u64)*p;
}
-static inline __be32 __cpu_to_be32p(const __u32 *p)
+static __always_inline __be32 __cpu_to_be32p(const __u32 *p)
{
return (__force __be32)*p;
}
-static inline __u32 __be32_to_cpup(const __be32 *p)
+static __always_inline __u32 __be32_to_cpup(const __be32 *p)
{
return (__force __u32)*p;
}
-static inline __be16 __cpu_to_be16p(const __u16 *p)
+static __always_inline __be16 __cpu_to_be16p(const __u16 *p)
{
return (__force __be16)*p;
}
-static inline __u16 __be16_to_cpup(const __be16 *p)
+static __always_inline __u16 __be16_to_cpup(const __be16 *p)
{
return (__force __u16)*p;
}
diff --git a/include/uapi/linux/byteorder/little_endian.h b/include/uapi/linux/byteorder/little_endian.h
index d876736a0017..4b93f2b260dd 100644
--- a/include/uapi/linux/byteorder/little_endian.h
+++ b/include/uapi/linux/byteorder/little_endian.h
@@ -40,51 +40,51 @@
#define __cpu_to_be16(x) ((__force __be16)__swab16((x)))
#define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x))
-static inline __le64 __cpu_to_le64p(const __u64 *p)
+static __always_inline __le64 __cpu_to_le64p(const __u64 *p)
{
return (__force __le64)*p;
}
-static inline __u64 __le64_to_cpup(const __le64 *p)
+static __always_inline __u64 __le64_to_cpup(const __le64 *p)
{
return (__force __u64)*p;
}
-static inline __le32 __cpu_to_le32p(const __u32 *p)
+static __always_inline __le32 __cpu_to_le32p(const __u32 *p)
{
return (__force __le32)*p;
}
-static inline __u32 __le32_to_cpup(const __le32 *p)
+static __always_inline __u32 __le32_to_cpup(const __le32 *p)
{
return (__force __u32)*p;
}
-static inline __le16 __cpu_to_le16p(const __u16 *p)
+static __always_inline __le16 __cpu_to_le16p(const __u16 *p)
{
return (__force __le16)*p;
}
-static inline __u16 __le16_to_cpup(const __le16 *p)
+static __always_inline __u16 __le16_to_cpup(const __le16 *p)
{
return (__force __u16)*p;
}
-static inline __be64 __cpu_to_be64p(const __u64 *p)
+static __always_inline __be64 __cpu_to_be64p(const __u64 *p)
{
return (__force __be64)__swab64p(p);
}
-static inline __u64 __be64_to_cpup(const __be64 *p)
+static __always_inline __u64 __be64_to_cpup(const __be64 *p)
{
return __swab64p((__u64 *)p);
}
-static inline __be32 __cpu_to_be32p(const __u32 *p)
+static __always_inline __be32 __cpu_to_be32p(const __u32 *p)
{
return (__force __be32)__swab32p(p);
}
-static inline __u32 __be32_to_cpup(const __be32 *p)
+static __always_inline __u32 __be32_to_cpup(const __be32 *p)
{
return __swab32p((__u32 *)p);
}
-static inline __be16 __cpu_to_be16p(const __u16 *p)
+static __always_inline __be16 __cpu_to_be16p(const __u16 *p)
{
return (__force __be16)__swab16p(p);
}
-static inline __u16 __be16_to_cpup(const __be16 *p)
+static __always_inline __u16 __be16_to_cpup(const __be16 *p)
{
return __swab16p((__u16 *)p);
}
diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h
index b56dfcfe922a..c3fdfe79e5cc 100644
--- a/include/uapi/linux/elf-em.h
+++ b/include/uapi/linux/elf-em.h
@@ -30,7 +30,6 @@
#define EM_X86_64 62 /* AMD x86-64 */
#define EM_S390 22 /* IBM S/390 */
#define EM_CRIS 76 /* Axis Communications 32-bit embedded processor */
-#define EM_V850 87 /* NEC v850 */
#define EM_M32R 88 /* Renesas M32R */
#define EM_MN10300 89 /* Panasonic/MEI MN10300, AM33 */
#define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */
@@ -50,8 +49,6 @@
*/
#define EM_ALPHA 0x9026
-/* Bogus old v850 magic number, used by old tools. */
-#define EM_CYGNUS_V850 0x9080
/* Bogus old m32r magic number, used by old tools. */
#define EM_CYGNUS_M32R 0x9041
/* This is the old interim value for S/390 architecture */
diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 0e011eb91b5d..3f10e5317b46 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -151,7 +151,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
* __swab16p - return a byteswapped 16-bit value from a pointer
* @p: pointer to a naturally-aligned 16-bit value
*/
-static inline __u16 __swab16p(const __u16 *p)
+static __always_inline __u16 __swab16p(const __u16 *p)
{
#ifdef __arch_swab16p
return __arch_swab16p(p);
@@ -164,7 +164,7 @@ static inline __u16 __swab16p(const __u16 *p)
* __swab32p - return a byteswapped 32-bit value from a pointer
* @p: pointer to a naturally-aligned 32-bit value
*/
-static inline __u32 __swab32p(const __u32 *p)
+static __always_inline __u32 __swab32p(const __u32 *p)
{
#ifdef __arch_swab32p
return __arch_swab32p(p);
@@ -177,7 +177,7 @@ static inline __u32 __swab32p(const __u32 *p)
* __swab64p - return a byteswapped 64-bit value from a pointer
* @p: pointer to a naturally-aligned 64-bit value
*/
-static inline __u64 __swab64p(const __u64 *p)
+static __always_inline __u64 __swab64p(const __u64 *p)
{
#ifdef __arch_swab64p
return __arch_swab64p(p);
@@ -232,7 +232,7 @@ static inline void __swab16s(__u16 *p)
* __swab32s - byteswap a 32-bit value in-place
* @p: pointer to a naturally-aligned 32-bit value
*/
-static inline void __swab32s(__u32 *p)
+static __always_inline void __swab32s(__u32 *p)
{
#ifdef __arch_swab32s
__arch_swab32s(p);
@@ -245,7 +245,7 @@ static inline void __swab32s(__u32 *p)
* __swab64s - byteswap a 64-bit value in-place
* @p: pointer to a naturally-aligned 64-bit value
*/
-static inline void __swab64s(__u64 *p)
+static __always_inline void __swab64s(__u64 *p)
{
#ifdef __arch_swab64s
__arch_swab64s(p);
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index d7f1cbc3766c..343d7ddefe04 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -51,7 +51,8 @@ struct virtio_balloon_config {
#define VIRTIO_BALLOON_S_MINFLT 3 /* Number of minor faults */
#define VIRTIO_BALLOON_S_MEMFREE 4 /* Total amount of free memory */
#define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */
-#define VIRTIO_BALLOON_S_NR 6
+#define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */
+#define VIRTIO_BALLOON_S_NR 7
/*
* Memory statistics structure.
diff --git a/kernel/fork.c b/kernel/fork.c
index 2e391c754ae7..accb7221d547 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -164,12 +164,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
+ if (page)
+ memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ 1 << THREAD_SIZE_ORDER);
+
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ struct page *page = virt_to_page(ti);
+
+ memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ -(1 << THREAD_SIZE_ORDER));
+ __free_kmem_pages(page, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
diff --git a/kernel/panic.c b/kernel/panic.c
index d96469de72dc..fa400852bf6c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,7 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/console.h>
+#include <linux/bug.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -449,20 +450,25 @@ void oops_exit(void)
kmsg_dump(KMSG_DUMP_OOPS);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-struct slowpath_args {
+struct warn_args {
const char *fmt;
va_list args;
};
-static void warn_slowpath_common(const char *file, int line, void *caller,
- unsigned taint, struct slowpath_args *args)
+void __warn(const char *file, int line, void *caller, unsigned taint,
+ struct pt_regs *regs, struct warn_args *args)
{
disable_trace_on_warning();
pr_warn("------------[ cut here ]------------\n");
- pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
- raw_smp_processor_id(), current->pid, file, line, caller);
+
+ if (file)
+ pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
+ raw_smp_processor_id(), current->pid, file, line,
+ caller);
+ else
+ pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
+ raw_smp_processor_id(), current->pid, caller);
if (args)
vprintk(args->fmt, args->args);
@@ -479,20 +485,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
}
print_modules();
- dump_stack();
+
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
print_oops_end_marker();
+
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
}
+#ifdef WANT_WARN_ON_SLOWPATH
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
{
- struct slowpath_args args;
+ struct warn_args args;
args.fmt = fmt;
va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0),
- TAINT_WARN, &args);
+ __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
+ &args);
va_end(args.args);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
@@ -500,20 +513,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt);
void warn_slowpath_fmt_taint(const char *file, int line,
unsigned taint, const char *fmt, ...)
{
- struct slowpath_args args;
+ struct warn_args args;
args.fmt = fmt;
va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0),
- taint, &args);
+ __warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
}
EXPORT_SYMBOL(warn_slowpath_fmt_taint);
void warn_slowpath_null(const char *file, int line)
{
- warn_slowpath_common(file, line, __builtin_return_address(0),
- TAINT_WARN, NULL);
+ __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c963ba534a78..bfbf284e4218 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -367,16 +367,20 @@ static int logbuf_has_space(u32 msg_size, bool empty)
static int log_make_free_space(u32 msg_size)
{
- while (log_first_seq < log_next_seq) {
- if (logbuf_has_space(msg_size, false))
- return 0;
+ while (log_first_seq < log_next_seq &&
+ !logbuf_has_space(msg_size, false)) {
/* drop old messages until we have enough contiguous space */
log_first_idx = log_next(log_first_idx);
log_first_seq++;
}
+ if (clear_seq < log_first_seq) {
+ clear_seq = log_first_seq;
+ clear_idx = log_first_idx;
+ }
+
/* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, true))
+ if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
return 0;
return -ENOMEM;
@@ -854,6 +858,7 @@ void log_buf_kexec_setup(void)
VMCOREINFO_SYMBOL(log_buf);
VMCOREINFO_SYMBOL(log_buf_len);
VMCOREINFO_SYMBOL(log_first_idx);
+ VMCOREINFO_SYMBOL(clear_idx);
VMCOREINFO_SYMBOL(log_next_idx);
/*
* Export struct printk_log size and field offsets. User space tools can
@@ -1216,12 +1221,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u32 idx;
enum log_flags prev;
- if (clear_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
@@ -1483,58 +1482,6 @@ static void zap_locks(void)
sema_init(&console_sem, 1);
}
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
- */
-static int have_callable_console(void)
-{
- struct console *con;
-
- for_each_console(con)
- if (con->flags & CON_ANYTIME)
- return 1;
-
- return 0;
-}
-
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
- */
-static inline int can_use_console(unsigned int cpu)
-{
- return cpu_online(cpu) || have_callable_console();
-}
-
-/*
- * Try to get console ownership to actually show the kernel
- * messages from a 'printk'. Return true (and with the
- * console_lock held, and 'console_locked' set) if it
- * is successful, false otherwise.
- */
-static int console_trylock_for_printk(void)
-{
- unsigned int cpu = smp_processor_id();
-
- if (!console_trylock())
- return 0;
- /*
- * If we can't use the console, we need to release the console
- * semaphore by hand to avoid flushing the buffer. We need to hold the
- * console semaphore in order to do this test safely.
- */
- if (!can_use_console(cpu)) {
- console_locked = 0;
- up_console_sem();
- return 0;
- }
- return 1;
-}
-
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
@@ -1681,7 +1628,6 @@ asmlinkage int vprintk_emit(int facility, int level,
boot_delay_msec(level);
printk_delay();
- /* This stops the holder of console_sem just where we want him */
local_irq_save(flags);
this_cpu = smp_processor_id();
@@ -1705,6 +1651,7 @@ asmlinkage int vprintk_emit(int facility, int level,
}
lockdep_off();
+ /* This stops the holder of console_sem just where we want him */
raw_spin_lock(&logbuf_lock);
logbuf_cpu = this_cpu;
@@ -1810,20 +1757,12 @@ asmlinkage int vprintk_emit(int facility, int level,
if (!in_sched) {
lockdep_off();
/*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
- */
- preempt_disable();
-
- /*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
- if (console_trylock_for_printk())
+ if (console_trylock())
console_unlock();
- preempt_enable();
lockdep_on();
}
@@ -2174,7 +2113,20 @@ int console_trylock(void)
return 0;
}
console_locked = 1;
- console_may_schedule = 0;
+ /*
+ * When PREEMPT_COUNT disabled we can't reliably detect if it's
+ * safe to schedule (e.g. calling printk while holding a spin_lock),
+ * because preempt_disable()/preempt_enable() are just barriers there
+ * and preempt_count() is always 0.
+ *
+ * RCU read sections have a separate preemption counter when
+ * PREEMPT_RCU enabled thus we must take extra care and check
+ * rcu_preempt_depth(), otherwise RCU read sections modify
+ * preempt_count().
+ */
+ console_may_schedule = !oops_in_progress &&
+ preemptible() &&
+ !rcu_preempt_depth();
return 1;
}
EXPORT_SYMBOL(console_trylock);
@@ -2184,6 +2136,34 @@ int is_console_locked(void)
return console_locked;
}
+/*
+ * Check if we have any console that is capable of printing while cpu is
+ * booting or shutting down. Requires console_sem.
+ */
+static int have_callable_console(void)
+{
+ struct console *con;
+
+ for_each_console(con)
+ if ((con->flags & CON_ENABLED) &&
+ (con->flags & CON_ANYTIME))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
+ */
+static inline int can_use_console(void)
+{
+ return cpu_online(raw_smp_processor_id()) || have_callable_console();
+}
+
static void console_cont_flush(char *text, size_t size)
{
unsigned long flags;
@@ -2254,9 +2234,21 @@ void console_unlock(void)
do_cond_resched = console_may_schedule;
console_may_schedule = 0;
+again:
+ /*
+ * We released the console_sem lock, so we need to recheck if
+ * cpu is online and (if not) is there at least one CON_ANYTIME
+ * console.
+ */
+ if (!can_use_console()) {
+ console_locked = 0;
+ up_console_sem();
+ return;
+ }
+
/* flush buffered message fragment immediately to console */
console_cont_flush(text, sizeof(text));
-again:
+
for (;;) {
struct printk_log *msg;
size_t ext_len = 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 78947de6f969..cf8ba545c7d3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = perf_event_task_enable();
break;
case PR_GET_TIMERSLACK:
- error = current->timer_slack_ns;
+ if (current->timer_slack_ns > ULONG_MAX)
+ error = ULONG_MAX;
+ else
+ error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
if (arg2 <= 0)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5102fabef7f..725587f10667 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -126,6 +126,7 @@ static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
+static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -1404,6 +1405,15 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "watermark_scale_factor",
+ .data = &watermark_scale_factor,
+ .maxlen = sizeof(watermark_scale_factor),
+ .mode = 0644,
+ .proc_handler = watermark_scale_factor_sysctl_handler,
+ .extra1 = &one,
+ .extra2 = &one_thousand,
+ },
+ {
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa909f9fd559..fa0b983290cf 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
/*
* High resolution timer enabled ?
*/
-static int hrtimer_hres_enabled __read_mostly = 1;
+static bool hrtimer_hres_enabled __read_mostly = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
@@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution);
*/
static int __init setup_hrtimer_hres(char *str)
{
- if (!strcmp(str, "off"))
- hrtimer_hres_enabled = 0;
- else if (!strcmp(str, "on"))
- hrtimer_hres_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
__setup("highres=", setup_hrtimer_hres);
@@ -979,7 +973,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
* relative (HRTIMER_MODE_REL)
*/
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode)
+ u64 delta_ns, const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -1548,7 +1542,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
- unsigned long slack;
+ u64 slack;
slack = current->timer_slack_ns;
if (dl_task(current) || rt_task(current))
@@ -1724,7 +1718,7 @@ void __init hrtimers_init(void)
* @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
*/
int __sched
-schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
+schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode, int clock)
{
struct hrtimer_sleeper t;
@@ -1792,7 +1786,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
*
* Returns 0 when the timer has expired otherwise -EINTR
*/
-int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode)
{
return schedule_hrtimeout_range_clock(expires, delta, mode,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 969e6704c3c9..195fe7d2caad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -486,20 +486,14 @@ void __init tick_nohz_init(void)
/*
* NO HZ enabled ?
*/
-int tick_nohz_enabled __read_mostly = 1;
+bool tick_nohz_enabled __read_mostly = true;
unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
static int __init setup_tick_nohz(char *str)
{
- if (!strcmp(str, "off"))
- tick_nohz_enabled = 0;
- else if (!strcmp(str, "on"))
- tick_nohz_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &tick_nohz_enabled) == 0);
}
__setup("nohz=", setup_tick_nohz);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index bbc5d1114583..d1798fa0c743 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible);
static void __sched do_usleep_range(unsigned long min, unsigned long max)
{
ktime_t kmin;
- unsigned long delta;
+ u64 delta;
kmin = ktime_set(0, min * NSEC_PER_USEC);
- delta = (max - min) * NSEC_PER_USEC;
+ delta = (u64)(max - min) * NSEC_PER_USEC;
schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b3ace6ebbba3..9acb29f280ec 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -923,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* both lockup detectors are disabled if proc_watchdog_update()
* returns an error.
*/
+ if (old == new)
+ goto out;
+
err = proc_watchdog_update();
}
out:
@@ -967,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
int proc_watchdog_thresh(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old;
+ int err, old, new;
get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
@@ -987,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
/*
* Update the sample period. Restore on failure.
*/
+ new = ACCESS_ONCE(watchdog_thresh);
+ if (old == new)
+ goto out;
+
set_sample_period();
err = proc_watchdog_update();
if (err) {
diff --git a/lib/bug.c b/lib/bug.c
index cff145f032a5..bc3656e944d2 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -167,19 +167,8 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
if (warning) {
/* this is a WARN_ON rather than BUG/BUG_ON */
- pr_warn("------------[ cut here ]------------\n");
-
- if (file)
- pr_warn("WARNING: at %s:%u\n", file, line);
- else
- pr_warn("WARNING: at %p [verbose debug info unavailable]\n",
- (void *)bugaddr);
-
- print_modules();
- show_regs(regs);
- print_oops_end_marker();
- /* Just a warning, don't kill lockdep. */
- add_taint(BUG_GET_TAINT(bug), LOCKDEP_STILL_OK);
+ __warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs,
+ NULL);
return BUG_TRAP_TYPE_WARN;
}
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index 94be244e8441..d8a5cf66c316 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -321,6 +321,70 @@ int kstrtos8(const char *s, unsigned int base, s8 *res)
}
EXPORT_SYMBOL(kstrtos8);
+/**
+ * kstrtobool - convert common user inputs into boolean values
+ * @s: input string
+ * @res: result
+ *
+ * This routine returns 0 iff the first character is one of 'Yy1Nn0', or
+ * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value
+ * pointed to by res is updated upon finding a match.
+ */
+int kstrtobool(const char *s, bool *res)
+{
+ if (!s)
+ return -EINVAL;
+
+ switch (s[0]) {
+ case 'y':
+ case 'Y':
+ case '1':
+ *res = true;
+ return 0;
+ case 'n':
+ case 'N':
+ case '0':
+ *res = false;
+ return 0;
+ case 'o':
+ case 'O':
+ switch (s[1]) {
+ case 'n':
+ case 'N':
+ *res = true;
+ return 0;
+ case 'f':
+ case 'F':
+ *res = false;
+ return 0;
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(kstrtobool);
+
+/*
+ * Since "base" would be a nonsense argument, this open-codes the
+ * _from_user helper instead of using the helper macro below.
+ */
+int kstrtobool_from_user(const char __user *s, size_t count, bool *res)
+{
+ /* Longest string needed to differentiate, newline, terminator */
+ char buf[4];
+
+ count = min(count, sizeof(buf) - 1);
+ if (copy_from_user(buf, s, count))
+ return -EFAULT;
+ buf[count] = '\0';
+ return kstrtobool(buf, res);
+}
+EXPORT_SYMBOL(kstrtobool_from_user);
+
#define kstrto_from_user(f, g, type) \
int f(const char __user *s, size_t count, unsigned int base, type *res) \
{ \
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6b79e9026e24..1624c4117961 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -173,6 +173,41 @@ radix_tree_find_next_bit(const unsigned long *addr,
return size;
}
+#if 0
+static void dump_node(void *slot, int height, int offset)
+{
+ struct radix_tree_node *node;
+ int i;
+
+ if (!slot)
+ return;
+
+ if (height == 0) {
+ pr_debug("radix entry %p offset %d\n", slot, offset);
+ return;
+ }
+
+ node = indirect_to_ptr(slot);
+ pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n",
+ slot, offset, node->tags[0][0], node->tags[1][0],
+ node->tags[2][0], node->path, node->count, node->parent);
+
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
+ dump_node(node->slots[i], height - 1, i);
+}
+
+/* For debug */
+static void radix_tree_dump(struct radix_tree_root *root)
+{
+ pr_debug("radix root: %p height %d rnode %p tags %x\n",
+ root, root->height, root->rnode,
+ root->gfp_mask >> __GFP_BITS_SHIFT);
+ if (!radix_tree_is_indirect_ptr(root->rnode))
+ return;
+ dump_node(root->rnode, root->height, 0);
+}
+#endif
+
/*
* This assumes that the caller has performed appropriate preallocation, and
* that the caller has pinned this thread of control to the current CPU.
@@ -192,6 +227,15 @@ radix_tree_node_alloc(struct radix_tree_root *root)
struct radix_tree_preload *rtp;
/*
+ * Even if the caller has preloaded, try to allocate from the
+ * cache first for the new node to get accounted.
+ */
+ ret = kmem_cache_alloc(radix_tree_node_cachep,
+ gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN);
+ if (ret)
+ goto out;
+
+ /*
* Provided the caller has preloaded here, we will always
* succeed in getting a node here (and never reach
* kmem_cache_alloc)
@@ -208,10 +252,11 @@ radix_tree_node_alloc(struct radix_tree_root *root)
* for debugging.
*/
kmemleak_update_trace(ret);
+ goto out;
}
- if (ret == NULL)
- ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
-
+ ret = kmem_cache_alloc(radix_tree_node_cachep,
+ gfp_mask | __GFP_ACCOUNT);
+out:
BUG_ON(radix_tree_is_indirect_ptr(ret));
return ret;
}
@@ -323,7 +368,8 @@ static inline unsigned long radix_tree_maxindex(unsigned int height)
/*
* Extend a radix tree so it can store key @index.
*/
-static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
+static int radix_tree_extend(struct radix_tree_root *root,
+ unsigned long index, unsigned order)
{
struct radix_tree_node *node;
struct radix_tree_node *slot;
@@ -335,7 +381,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
while (index > radix_tree_maxindex(height))
height++;
- if (root->rnode == NULL) {
+ if ((root->rnode == NULL) && (order == 0)) {
root->height = height;
goto out;
}
@@ -358,9 +404,10 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
node->count = 1;
node->parent = NULL;
slot = root->rnode;
- if (newheight > 1) {
+ if (radix_tree_is_indirect_ptr(slot) && newheight > 1) {
slot = indirect_to_ptr(slot);
slot->parent = node;
+ slot = ptr_to_indirect(slot);
}
node->slots[0] = slot;
node = ptr_to_indirect(node);
@@ -375,6 +422,7 @@ out:
* __radix_tree_create - create a slot in a radix tree
* @root: radix tree root
* @index: index key
+ * @order: index occupies 2^order aligned slots
* @nodep: returns node
* @slotp: returns slot
*
@@ -388,26 +436,29 @@ out:
* Returns -ENOMEM, or 0 for success.
*/
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
- struct radix_tree_node **nodep, void ***slotp)
+ unsigned order, struct radix_tree_node **nodep,
+ void ***slotp)
{
struct radix_tree_node *node = NULL, *slot;
unsigned int height, shift, offset;
int error;
+ BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT));
+
/* Make sure the tree is high enough. */
if (index > radix_tree_maxindex(root->height)) {
- error = radix_tree_extend(root, index);
+ error = radix_tree_extend(root, index, order);
if (error)
return error;
}
- slot = indirect_to_ptr(root->rnode);
+ slot = root->rnode;
height = root->height;
- shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ shift = height * RADIX_TREE_MAP_SHIFT;
offset = 0; /* uninitialised var warning */
- while (height > 0) {
+ while (shift > order) {
if (slot == NULL) {
/* Have to add a child node. */
if (!(slot = radix_tree_node_alloc(root)))
@@ -415,19 +466,38 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
slot->path = height;
slot->parent = node;
if (node) {
- rcu_assign_pointer(node->slots[offset], slot);
+ rcu_assign_pointer(node->slots[offset],
+ ptr_to_indirect(slot));
node->count++;
slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
} else
- rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
- }
+ rcu_assign_pointer(root->rnode,
+ ptr_to_indirect(slot));
+ } else if (!radix_tree_is_indirect_ptr(slot))
+ break;
/* Go a level down */
+ height--;
+ shift -= RADIX_TREE_MAP_SHIFT;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- node = slot;
+ node = indirect_to_ptr(slot);
slot = node->slots[offset];
- shift -= RADIX_TREE_MAP_SHIFT;
- height--;
+ }
+
+ /* Insert pointers to the canonical entry */
+ if ((shift - order) > 0) {
+ int i, n = 1 << (shift - order);
+ offset = offset & ~(n - 1);
+ slot = ptr_to_indirect(&node->slots[offset]);
+ for (i = 0; i < n; i++) {
+ if (node->slots[offset + i])
+ return -EEXIST;
+ }
+
+ for (i = 1; i < n; i++) {
+ rcu_assign_pointer(node->slots[offset + i], slot);
+ node->count++;
+ }
}
if (nodep)
@@ -438,15 +508,16 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
}
/**
- * radix_tree_insert - insert into a radix tree
+ * __radix_tree_insert - insert into a radix tree
* @root: radix tree root
* @index: index key
+ * @order: key covers the 2^order indices around index
* @item: item to insert
*
* Insert an item into the radix tree at position @index.
*/
-int radix_tree_insert(struct radix_tree_root *root,
- unsigned long index, void *item)
+int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
+ unsigned order, void *item)
{
struct radix_tree_node *node;
void **slot;
@@ -454,7 +525,7 @@ int radix_tree_insert(struct radix_tree_root *root,
BUG_ON(radix_tree_is_indirect_ptr(item));
- error = __radix_tree_create(root, index, &node, &slot);
+ error = __radix_tree_create(root, index, order, &node, &slot);
if (error)
return error;
if (*slot != NULL)
@@ -472,7 +543,7 @@ int radix_tree_insert(struct radix_tree_root *root,
return 0;
}
-EXPORT_SYMBOL(radix_tree_insert);
+EXPORT_SYMBOL(__radix_tree_insert);
/**
* __radix_tree_lookup - lookup an item in a radix tree
@@ -523,6 +594,9 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
node = rcu_dereference_raw(*slot);
if (node == NULL)
return NULL;
+ if (!radix_tree_is_indirect_ptr(node))
+ break;
+ node = indirect_to_ptr(node);
shift -= RADIX_TREE_MAP_SHIFT;
height--;
@@ -609,6 +683,9 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
tag_set(slot, tag, offset);
slot = slot->slots[offset];
BUG_ON(slot == NULL);
+ if (!radix_tree_is_indirect_ptr(slot))
+ break;
+ slot = indirect_to_ptr(slot);
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
@@ -648,11 +725,14 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
goto out;
shift = height * RADIX_TREE_MAP_SHIFT;
- slot = indirect_to_ptr(root->rnode);
+ slot = root->rnode;
while (shift) {
if (slot == NULL)
goto out;
+ if (!radix_tree_is_indirect_ptr(slot))
+ break;
+ slot = indirect_to_ptr(slot);
shift -= RADIX_TREE_MAP_SHIFT;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
@@ -728,6 +808,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
if (node == NULL)
return 0;
+ node = indirect_to_ptr(node);
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
if (!tag_get(node, tag, offset))
@@ -735,6 +816,8 @@ int radix_tree_tag_get(struct radix_tree_root *root,
if (height == 1)
return 1;
node = rcu_dereference_raw(node->slots[offset]);
+ if (!radix_tree_is_indirect_ptr(node))
+ return 1;
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
@@ -795,6 +878,7 @@ restart:
node = rnode;
while (1) {
+ struct radix_tree_node *slot;
if ((flags & RADIX_TREE_ITER_TAGGED) ?
!test_bit(offset, node->tags[tag]) :
!node->slots[offset]) {
@@ -825,9 +909,12 @@ restart:
if (!shift)
break;
- node = rcu_dereference_raw(node->slots[offset]);
- if (node == NULL)
+ slot = rcu_dereference_raw(node->slots[offset]);
+ if (slot == NULL)
goto restart;
+ if (!radix_tree_is_indirect_ptr(slot))
+ break;
+ node = indirect_to_ptr(slot);
shift -= RADIX_TREE_MAP_SHIFT;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
}
@@ -925,15 +1012,20 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
if (!tag_get(slot, iftag, offset))
goto next;
if (shift) {
- /* Go down one level */
- shift -= RADIX_TREE_MAP_SHIFT;
node = slot;
slot = slot->slots[offset];
- continue;
+ if (radix_tree_is_indirect_ptr(slot)) {
+ slot = indirect_to_ptr(slot);
+ shift -= RADIX_TREE_MAP_SHIFT;
+ continue;
+ } else {
+ slot = node;
+ node = node->parent;
+ }
}
/* tag the leaf */
- tagged++;
+ tagged += 1 << shift;
tag_set(slot, settag, offset);
/* walk back up the path tagging interior nodes */
@@ -1181,10 +1273,20 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
goto out;
}
- shift -= RADIX_TREE_MAP_SHIFT;
slot = rcu_dereference_raw(slot->slots[i]);
if (slot == NULL)
goto out;
+ if (!radix_tree_is_indirect_ptr(slot)) {
+ if (slot == item) {
+ *found_index = index + i;
+ index = 0;
+ } else {
+ index += shift;
+ }
+ goto out;
+ }
+ slot = indirect_to_ptr(slot);
+ shift -= RADIX_TREE_MAP_SHIFT;
}
/* Bottom level: check items */
@@ -1264,11 +1366,13 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
/*
* The candidate node has more than one child, or its child
- * is not at the leftmost slot, we cannot shrink.
+ * is not at the leftmost slot, or it is a multiorder entry,
+ * we cannot shrink.
*/
if (to_free->count != 1)
break;
- if (!to_free->slots[0])
+ slot = to_free->slots[0];
+ if (!slot)
break;
/*
@@ -1278,8 +1382,11 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
* (to_free->slots[0]), it will be safe to dereference the new
* one (root->rnode) as far as dependent read barriers go.
*/
- slot = to_free->slots[0];
if (root->height > 1) {
+ if (!radix_tree_is_indirect_ptr(slot))
+ break;
+
+ slot = indirect_to_ptr(slot);
slot->parent = NULL;
slot = ptr_to_indirect(slot);
}
@@ -1377,7 +1484,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
unsigned long index, void *item)
{
struct radix_tree_node *node;
- unsigned int offset;
+ unsigned int offset, i;
void **slot;
void *entry;
int tag;
@@ -1406,6 +1513,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
radix_tree_tag_clear(root, index, tag);
}
+ /* Delete any sibling slots pointing to this slot */
+ for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[offset + i] != ptr_to_indirect(slot))
+ break;
+ node->slots[offset + i] = NULL;
+ node->count--;
+ }
node->slots[offset] = NULL;
node->count--;
diff --git a/lib/string.c b/lib/string.c
index 0323c0d5629a..ed83562a53ae 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -631,33 +631,30 @@ bool sysfs_streq(const char *s1, const char *s2)
EXPORT_SYMBOL(sysfs_streq);
/**
- * strtobool - convert common user inputs into boolean values
- * @s: input string
- * @res: result
+ * match_string - matches given string in an array
+ * @array: array of strings
+ * @n: number of strings in the array or -1 for NULL terminated arrays
+ * @string: string to match with
*
- * This routine returns 0 iff the first character is one of 'Yy1Nn0'.
- * Otherwise it will return -EINVAL. Value pointed to by res is
- * updated upon finding a match.
- */
-int strtobool(const char *s, bool *res)
-{
- switch (s[0]) {
- case 'y':
- case 'Y':
- case '1':
- *res = true;
- break;
- case 'n':
- case 'N':
- case '0':
- *res = false;
- break;
- default:
- return -EINVAL;
+ * Return:
+ * index of a @string in the @array if matches, or %-EINVAL otherwise.
+ */
+int match_string(const char * const *array, size_t n, const char *string)
+{
+ int index;
+ const char *item;
+
+ for (index = 0; index < n; index++) {
+ item = array[index];
+ if (!item)
+ break;
+ if (!strcmp(item, string))
+ return index;
}
- return 0;
+
+ return -EINVAL;
}
-EXPORT_SYMBOL(strtobool);
+EXPORT_SYMBOL(match_string);
#ifndef __HAVE_ARCH_MEMSET
/**
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 525c8e19bda2..ccb664b54280 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2640,8 +2640,12 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
if (*fmt == '*') {
if (!*str)
break;
- while (!isspace(*fmt) && *fmt != '%' && *fmt)
+ while (!isspace(*fmt) && *fmt != '%' && *fmt) {
+ /* '%*[' not yet supported, invalid format */
+ if (*fmt == '[')
+ return num;
fmt++;
+ }
while (!isspace(*str) && *str)
str++;
continue;
@@ -2714,6 +2718,59 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
num++;
}
continue;
+ /*
+ * Warning: This implementation of the '[' conversion specifier
+ * deviates from its glibc counterpart in the following ways:
+ * (1) It does NOT support ranges i.e. '-' is NOT a special
+ * character
+ * (2) It cannot match the closing bracket ']' itself
+ * (3) A field width is required
+ * (4) '%*[' (discard matching input) is currently not supported
+ *
+ * Example usage:
+ * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]",
+ * buf1, buf2, buf3);
+ * if (ret < 3)
+ * // etc..
+ */
+ case '[':
+ {
+ char *s = (char *)va_arg(args, char *);
+ DECLARE_BITMAP(set, 256) = {0};
+ unsigned int len = 0;
+ bool negate = (*fmt == '^');
+
+ /* field width is required */
+ if (field_width == -1)
+ return num;
+
+ if (negate)
+ ++fmt;
+
+ for ( ; *fmt && *fmt != ']'; ++fmt, ++len)
+ set_bit((u8)*fmt, set);
+
+ /* no ']' or no character set found */
+ if (!*fmt || !len)
+ return num;
+ ++fmt;
+
+ if (negate) {
+ bitmap_complement(set, set, 256);
+ /* exclude null '\0' byte */
+ clear_bit(0, set);
+ }
+
+ /* match must be non-empty */
+ if (!test_bit((u8)*str, set))
+ return num;
+
+ while (test_bit((u8)*str, set) && field_width--)
+ *s++ = *str++;
+ *s = '\0';
+ ++num;
+ }
+ continue;
case 'o':
base = 8;
break;
diff --git a/mm/Kconfig b/mm/Kconfig
index 03cbfa072f42..05efa6a5199e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -187,7 +187,6 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -652,10 +651,9 @@ config IDLE_PAGE_TRACKING
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support" if EXPERT
- default !ZONE_DMA
- depends on !ZONE_DMA
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
depends on X86_64 #arch_add_memory() comprehends device memory
help
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5c50b238b770..22f4cd96acb0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -79,3 +79,16 @@ config PAGE_POISONING_ZERO
Enabling page poisoning with this option will disable hibernation
If unsure, say N
+ bool
+
+config DEBUG_PAGE_REF
+ bool "Enable tracepoint to track down page reference manipulation"
+ depends on DEBUG_KERNEL
+ depends on TRACEPOINTS
+ ---help---
+ This is a feature to add tracepoint for tracking down page reference
+ manipulation. This tracking is useful to diagnose functional failure
+ due to migration failures caused by page reference mismatches. Be
+ careful when enabling this feature because it adds about 30 KB to the
+ kernel code. However the runtime performance overhead is virtually
+ nil until the tracepoints are actually enabled.
diff --git a/mm/Makefile b/mm/Makefile
index cfdd481d27a5..6da300a1414b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -81,3 +81,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c554d173a65f..bfbd7096b6ed 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1026,8 +1026,8 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write,
if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
return -EFAULT;
- printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
- table->procname);
+ pr_warn_once("%s exported in /proc is scheduled for removal\n",
+ table->procname);
*lenp = 2;
*ppos += *lenp;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 91e32bc8517f..0aa7dda52402 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -50,8 +50,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
#define bdebug(fmt, args...) ({ \
if (unlikely(bootmem_debug)) \
- printk(KERN_INFO \
- "bootmem::%s " fmt, \
+ pr_info("bootmem::%s " fmt, \
__func__, ## args); \
})
@@ -680,7 +679,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
/*
* Whoops, we cannot satisfy the allocation request.
*/
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
@@ -755,7 +754,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (ptr)
return ptr;
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 93f71d968098..ccf97b02b85f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
*/
+#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
@@ -17,6 +18,8 @@
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -1188,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/*
* Mark that the PG_migrate_skip information should be cleared
- * by kswapd when it goes to sleep. kswapd does not set the
+ * by kswapd when it goes to sleep. kcompactd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
- if (!current_is_kswapd())
+ if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
return COMPACT_COMPLETE;
@@ -1335,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
/*
* Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
+ * is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ if (compaction_restarting(zone, cc->order))
__reset_isolation_suitable(zone);
/*
@@ -1474,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
+ .direct_compaction = true,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1736,4 +1739,223 @@ void compaction_unregister_node(struct node *node)
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+ return pgdat->kcompactd_max_order > 0;
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+ for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+ classzone_idx) == COMPACT_CONTINUE)
+ return true;
+ }
+
+ return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+ /*
+ * With no special task, compact all zones so that a page of requested
+ * order is allocatable.
+ */
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = pgdat->kcompactd_max_order,
+ .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+
+ };
+ bool success = false;
+
+ trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+ cc.classzone_idx);
+ count_vm_event(KCOMPACTD_WAKE);
+
+ for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
+ int status;
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_deferred(zone, cc.order))
+ continue;
+
+ if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+ COMPACT_CONTINUE)
+ continue;
+
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ status = compact_zone(zone, &cc);
+
+ if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+ cc.classzone_idx, 0)) {
+ success = true;
+ compaction_defer_reset(zone, cc.order, false);
+ } else if (status == COMPACT_COMPLETE) {
+ /*
+ * We use sync migration mode here, so we defer like
+ * sync direct compaction does.
+ */
+ defer_compaction(zone, cc.order);
+ }
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ /*
+ * Regardless of success, we are done until woken up next. But remember
+ * the requested order/classzone_idx in case it was higher/tighter than
+ * our current ones
+ */
+ if (pgdat->kcompactd_max_order <= cc.order)
+ pgdat->kcompactd_max_order = 0;
+ if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ if (!order)
+ return;
+
+ if (pgdat->kcompactd_max_order < order)
+ pgdat->kcompactd_max_order = order;
+
+ if (pgdat->kcompactd_classzone_idx > classzone_idx)
+ pgdat->kcompactd_classzone_idx = classzone_idx;
+
+ if (!waitqueue_active(&pgdat->kcompactd_wait))
+ return;
+
+ if (!kcompactd_node_suitable(pgdat))
+ return;
+
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+ classzone_idx);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ set_freezable();
+
+ pgdat->kcompactd_max_order = 0;
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+ while (!kthread_should_stop()) {
+ trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ wait_event_freezable(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat));
+
+ kcompactd_do_work(pgdat);
+ }
+
+ return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kcompactd)
+ return 0;
+
+ pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ if (IS_ERR(pgdat->kcompactd)) {
+ pr_err("Failed to start kcompactd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kcompactd);
+ pgdat->kcompactd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+ struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+ if (kcompactd) {
+ kthread_stop(kcompactd);
+ NODE_DATA(nid)->kcompactd = NULL;
+ }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ int nid;
+
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static int __init kcompactd_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ kcompactd_run(nid);
+ hotcpu_notifier(cpu_callback, 0);
+ return 0;
+}
+subsys_initcall(kcompactd_init)
+
#endif /* CONFIG_COMPACTION */
diff --git a/mm/debug.c b/mm/debug.c
index df7247b0b532..8865bfb41b0b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -43,7 +43,7 @@ const struct trace_print_flags vmaflag_names[] = {
void __dump_page(struct page *page, const char *reason)
{
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
- page, atomic_read(&page->_count), page_mapcount(page),
+ page, page_ref_count(page), page_mapcount(page),
page->mapping, page->index);
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c
new file mode 100644
index 000000000000..1aef3d562e52
--- /dev/null
+++ b/mm/debug_page_ref.c
@@ -0,0 +1,54 @@
+#include <linux/mm_types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_ref.h>
+
+void __page_ref_set(struct page *page, int v)
+{
+ trace_page_ref_set(page, v);
+}
+EXPORT_SYMBOL(__page_ref_set);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_set);
+
+void __page_ref_mod(struct page *page, int v)
+{
+ trace_page_ref_mod(page, v);
+}
+EXPORT_SYMBOL(__page_ref_mod);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod);
+
+void __page_ref_mod_and_test(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_test(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_test);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test);
+
+void __page_ref_mod_and_return(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_return(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_return);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return);
+
+void __page_ref_mod_unless(struct page *page, int v, int u)
+{
+ trace_page_ref_mod_unless(page, v, u);
+}
+EXPORT_SYMBOL(__page_ref_mod_unless);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless);
+
+void __page_ref_freeze(struct page *page, int v, int ret)
+{
+ trace_page_ref_freeze(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_freeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze);
+
+void __page_ref_unfreeze(struct page *page, int v)
+{
+ trace_page_ref_unfreeze(page, v);
+}
+EXPORT_SYMBOL(__page_ref_unfreeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 57312b5d6e12..abcbfe86c25a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -294,8 +294,7 @@ void dma_pool_destroy(struct dma_pool *pool)
"dma_pool_destroy %s, %p busy\n",
pool->name, page->vaddr);
else
- printk(KERN_ERR
- "dma_pool_destroy %s, %p busy\n",
+ pr_err("dma_pool_destroy %s, %p busy\n",
pool->name, page->vaddr);
/* leak the still-in-use consistent memory */
list_del(&page->page_list);
@@ -424,7 +423,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
"dma_pool_free %s, %p/%lx (bad dma)\n",
pool->name, vaddr, (unsigned long)dma);
else
- printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pr_err("dma_pool_free %s, %p/%lx (bad dma)\n",
pool->name, vaddr, (unsigned long)dma);
return;
}
@@ -438,8 +437,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
pool->name, vaddr, (unsigned long long)dma);
else
- printk(KERN_ERR
- "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n",
pool->name, vaddr, (unsigned long long)dma);
return;
}
@@ -452,13 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
- "already free\n", pool->name,
- (unsigned long long)dma);
+ dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n",
+ pool->name, (unsigned long long)dma);
else
- printk(KERN_ERR "dma_pool_free %s, dma %Lx "
- "already free\n", pool->name,
- (unsigned long long)dma);
+ pr_err("dma_pool_free %s, dma %Lx already free\n",
+ pool->name, (unsigned long long)dma);
return;
}
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 61b441b191ad..7c00f105845e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -586,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;
- error = __radix_tree_create(&mapping->page_tree, page->index,
+ error = __radix_tree_create(&mapping->page_tree, page->index, 0,
&node, &slot);
if (error)
return error;
@@ -1255,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1263,8 +1262,10 @@ repeat:
if (unlikely(!page))
continue;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
/*
* A shadow entry of a recently evicted page, a swap
* entry from shmem/tmpfs or a DAX entry. Return it
@@ -1317,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1327,13 +1327,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- WARN_ON(iter.index);
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1384,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
struct page *page;
repeat:
@@ -1395,12 +1389,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1460,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *page;
@@ -1471,12 +1460,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page.
@@ -1539,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
struct page *page;
@@ -1549,12 +1533,8 @@ repeat:
continue;
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
@@ -2171,10 +2151,11 @@ repeat:
if (unlikely(!page))
goto next;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- break;
- else
- goto next;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ goto next;
}
if (!page_cache_get_speculative(page))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1ea21e203a70..021db1781872 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -78,7 +78,7 @@ unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
@@ -168,8 +168,7 @@ static void set_recommended_min_free_kbytes(void)
if (recommended_min > min_free_kbytes) {
if (user_min_free_kbytes >= 0)
- pr_info("raising min_free_kbytes from %d to %lu "
- "to help transparent hugepage allocations\n",
+ pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
min_free_kbytes, recommended_min);
min_free_kbytes = recommended_min;
@@ -270,37 +269,35 @@ static struct shrinker huge_zero_page_shrinker = {
#ifdef CONFIG_SYSFS
-static ssize_t double_flag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf,
- enum transparent_hugepage_flag enabled,
- enum transparent_hugepage_flag req_madv)
-{
- if (test_bit(enabled, &transparent_hugepage_flags)) {
- VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
- return sprintf(buf, "[always] madvise never\n");
- } else if (test_bit(req_madv, &transparent_hugepage_flags))
- return sprintf(buf, "always [madvise] never\n");
- else
- return sprintf(buf, "always madvise [never]\n");
-}
-static ssize_t double_flag_store(struct kobject *kobj,
+static ssize_t triple_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag deferred,
enum transparent_hugepage_flag req_madv)
{
- if (!memcmp("always", buf,
+ if (!memcmp("defer", buf,
+ min(sizeof("defer")-1, count))) {
+ if (enabled == deferred)
+ return -EINVAL;
+ clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ set_bit(deferred, &transparent_hugepage_flags);
+ } else if (!memcmp("always", buf,
min(sizeof("always")-1, count))) {
- set_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
clear_bit(req_madv, &transparent_hugepage_flags);
+ set_bit(enabled, &transparent_hugepage_flags);
} else if (!memcmp("madvise", buf,
min(sizeof("madvise")-1, count))) {
clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
set_bit(req_madv, &transparent_hugepage_flags);
} else if (!memcmp("never", buf,
min(sizeof("never")-1, count))) {
clear_bit(enabled, &transparent_hugepage_flags);
clear_bit(req_madv, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
} else
return -EINVAL;
@@ -310,17 +307,22 @@ static ssize_t double_flag_store(struct kobject *kobj,
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return double_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_FLAG,
- TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+ if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
}
+
static ssize_t enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
ssize_t ret;
- ret = double_flag_store(kobj, attr, buf, count,
+ ret = triple_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
@@ -378,16 +380,23 @@ static ssize_t single_flag_store(struct kobject *kobj,
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return double_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] defer madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [defer] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer [madvise] never\n");
+ else
+ return sprintf(buf, "always defer madvise [never]\n");
+
}
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- return double_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ return triple_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
@@ -843,9 +852,30 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
return 0;
}
-static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+/*
+ * If THP is set to always then directly reclaim/compact as necessary
+ * If set to defer then do no reclaim and defer to khugepaged
+ * If set to madvise and the VMA is flagged then directly reclaim/compact
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+ gfp_t reclaim_flags = 0;
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
+ (vma->vm_flags & VM_HUGEPAGE))
+ reclaim_flags = __GFP_DIRECT_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ reclaim_flags = __GFP_KSWAPD_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ reclaim_flags = __GFP_DIRECT_RECLAIM;
+
+ return GFP_TRANSHUGE | reclaim_flags;
+}
+
+/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
+static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
- return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
+ return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
}
/* Caller must hold page table lock. */
@@ -919,7 +949,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
return ret;
}
- gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ gfp = alloc_hugepage_direct_gfpmask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
@@ -1279,7 +1309,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -2249,11 +2279,12 @@ static int khugepaged_find_target_node(void)
return 0;
}
-static inline struct page *alloc_hugepage(int defrag)
+static inline struct page *alloc_khugepaged_hugepage(void)
{
struct page *page;
- page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+ HPAGE_PMD_ORDER);
if (page)
prep_transhuge_page(page);
return page;
@@ -2264,7 +2295,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
struct page *hpage;
do {
- hpage = alloc_hugepage(khugepaged_defrag());
+ hpage = alloc_khugepaged_hugepage();
if (!hpage) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
if (!*wait)
@@ -2335,8 +2366,7 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* Only allocate from the target node */
- gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
- __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
@@ -2857,7 +2887,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
- atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ page_ref_add(page, HPAGE_PMD_NR - 1);
write = pmd_write(*pmd);
young = pmd_young(*pmd);
dirty = pmd_dirty(*pmd);
@@ -2947,44 +2977,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
unsigned long haddr = address & HPAGE_PMD_MASK;
mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
ptl = pmd_lock(mm, pmd);
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ struct page *page = pmd_page(*pmd);
if (PageMlocked(page))
- get_page(page);
- else
- page = NULL;
+ clear_page_mlock(page);
} else if (!pmd_devmap(*pmd))
goto out;
- __split_huge_pmd_locked(vma, pmd, haddr, false);
+ __split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
- if (page) {
- lock_page(page);
- munlock_vma_page(page);
- unlock_page(page);
- put_page(page);
- }
}
-static void split_huge_pmd_address(struct vm_area_struct *vma,
- unsigned long address)
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
-
pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
@@ -2996,11 +3015,20 @@ static void split_huge_pmd_address(struct vm_area_struct *vma,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
return;
+
+ /*
+ * If caller asks to setup a migration entries, we need a page to check
+ * pmd against. Otherwise we can end up replacing wrong page.
+ */
+ VM_BUG_ON(freeze && !page);
+ if (page && page != pmd_page(*pmd))
+ return;
+
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_pmd(vma, pmd, address);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3016,7 +3044,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, start);
+ split_huge_pmd_address(vma, start, false, NULL);
/*
* If the new end address isn't hpage aligned and it could
@@ -3026,7 +3054,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, end);
+ split_huge_pmd_address(vma, end, false, NULL);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
@@ -3040,184 +3068,36 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_pmd_address(next, nstart);
+ split_huge_pmd_address(next, nstart, false, NULL);
}
}
-static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static void freeze_page(struct page *page)
{
- unsigned long haddr = address & HPAGE_PMD_MASK;
- spinlock_t *ptl;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
- pmd = pmd_offset(pud, address);
- ptl = pmd_lock(vma->vm_mm, pmd);
- if (!pmd_present(*pmd)) {
- spin_unlock(ptl);
- return;
- }
- if (pmd_trans_huge(*pmd)) {
- if (page == pmd_page(*pmd))
- __split_huge_pmd_locked(vma, pmd, haddr, true);
- spin_unlock(ptl);
- return;
- }
- spin_unlock(ptl);
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- pte_t entry, swp_pte;
- swp_entry_t swp_entry;
-
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non PMD-aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!pte_present(*pte))
- continue;
- if (page_to_pfn(page) != pte_pfn(*pte))
- continue;
- flush_cache_page(vma, address, page_to_pfn(page));
- entry = ptep_clear_flush(vma, address, pte);
- if (pte_dirty(entry))
- SetPageDirty(page);
- swp_entry = make_migration_entry(page, pte_write(entry));
- swp_pte = swp_entry_to_pte(swp_entry);
- if (pte_soft_dirty(entry))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(vma->vm_mm, address, pte, swp_pte);
- page_remove_rmap(page, false);
- put_page(page);
- }
- pte_unmap_unlock(pte - 1, ptl);
-}
-
-static void freeze_page(struct anon_vma *anon_vma, struct page *page)
-{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
+ enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
+ TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+ int i, ret;
VM_BUG_ON_PAGE(!PageHead(page), page);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
- pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
-
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- freeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
-}
-
-static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
-{
- spinlock_t *ptl;
- pmd_t *pmd;
- pte_t *pte, entry;
- swp_entry_t swp_entry;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non-PMD aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!is_swap_pte(*pte))
- continue;
-
- swp_entry = pte_to_swp_entry(*pte);
- if (!is_migration_entry(swp_entry))
- continue;
- if (migration_entry_to_page(swp_entry) != page)
- continue;
-
- get_page(page);
- page_add_anon_rmap(page, vma, address, false);
-
- entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
- if (PageDirty(page))
- entry = pte_mkdirty(entry);
- if (is_write_migration_entry(swp_entry))
- entry = maybe_mkwrite(entry, vma);
-
- flush_dcache_page(page);
- set_pte_at(vma->vm_mm, address, pte, entry);
+ /* We only need TTU_SPLIT_HUGE_PMD once */
+ ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
+ for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
+ /* Cut short if the page is unmapped */
+ if (page_count(page) == 1)
+ return;
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte);
+ ret = try_to_unmap(page + i, ttu_flags);
}
- pte_unmap_unlock(pte - 1, ptl);
+ VM_BUG_ON(ret);
}
-static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+static void unfreeze_page(struct page *page)
{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
-
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
- pgoff, pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
+ int i;
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- unfreeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ remove_migration_ptes(page + i, page + i, true);
}
static void __split_huge_page_tail(struct page *head, int tail,
@@ -3226,7 +3106,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
struct page *page_tail = head + tail;
VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+ VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
* tail_page->_count is zero and not changing from under us. But
@@ -3239,7 +3119,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* atomic_set() here would be safe on all archs (and not only on x86),
* it's safer to use atomic_inc().
*/
- atomic_inc(&page_tail->_count);
+ page_ref_inc(page_tail);
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3295,7 +3175,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
ClearPageCompound(head);
spin_unlock_irq(&zone->lru_lock);
- unfreeze_page(page_anon_vma(head), head);
+ unfreeze_page(head);
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
@@ -3391,7 +3271,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
mlocked = PageMlocked(page);
- freeze_page(anon_vma, head);
+ freeze_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -3420,7 +3300,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
BUG();
} else {
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- unfreeze_page(anon_vma, head);
+ unfreeze_page(head);
ret = -EBUSY;
}
@@ -3455,6 +3335,7 @@ void deferred_split_huge_page(struct page *page)
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
list_add_tail(page_deferred_list(page), &pgdata->split_queue);
pgdata->split_queue_len++;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aefba5a9cc47..06058eaa173b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2665,7 +2665,7 @@ void __init hugetlb_add_hstate(unsigned int order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warning("hugepagesz= specified twice, ignoring\n");
+ pr_warn("hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -2701,8 +2701,7 @@ static int __init hugetlb_nrpages_setup(char *s)
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warning("hugepages= specified twice without "
- "interleaving hugepagesz=, ignoring\n");
+ pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
return 1;
}
diff --git a/mm/internal.h b/mm/internal.h
index ad9400d759c8..7449392c6faa 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,11 +38,6 @@
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline void set_page_count(struct page *page, int v)
-{
- atomic_set(&page->_count, v);
-}
-
extern int __do_page_cache_readahead(struct address_space *mapping,
struct file *filp, pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size);
@@ -64,7 +59,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
static inline void set_page_refcounted(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
+ VM_BUG_ON_PAGE(page_ref_count(page), page);
set_page_count(page, 1);
}
@@ -148,9 +143,6 @@ extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
-#ifdef CONFIG_MEMORY_FAILURE
-extern bool is_free_buddy_page(struct page *page);
-#endif
extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -175,6 +167,7 @@ struct compact_control {
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool direct_compaction; /* False from kcompactd or /proc/... */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const int alloc_flags; /* alloc flags of a direct compactor */
@@ -393,7 +386,7 @@ extern int mminit_loglevel;
do { \
if (level < mminit_loglevel) { \
if (level <= MMINIT_WARNING) \
- printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
+ pr_warn("mminit::" prefix " " fmt, ##arg); \
else \
printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 12f222d0224b..745aa8f36028 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -214,8 +214,7 @@ static void kasan_report_error(struct kasan_access_info *info)
*/
kasan_disable_current();
spin_lock_irqsave(&report_lock, flags);
- pr_err("================================="
- "=================================\n");
+ pr_err("==================================================================\n");
if (info->access_addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
if ((unsigned long)info->access_addr < PAGE_SIZE)
@@ -236,8 +235,7 @@ static void kasan_report_error(struct kasan_access_info *info)
print_address_description(info);
print_shadow_for_address(info->first_bad_addr);
}
- pr_err("================================="
- "=================================\n");
+ pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, flags);
kasan_enable_current();
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 6f4f424037c0..5bf191756a4a 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
if (!shadow) {
if (printk_ratelimit())
- printk(KERN_ERR "kmemcheck: failed to allocate "
- "shadow bitmap\n");
+ pr_err("kmemcheck: failed to allocate shadow bitmap\n");
return;
}
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index dcdcadb69533..dd3c23a801b1 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -49,7 +49,7 @@ static int __init kmemleak_test_init(void)
struct test_node *elem;
int i;
- printk(KERN_INFO "Kmemleak testing\n");
+ pr_info("Kmemleak testing\n");
/* make some orphan objects */
pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 25c0ad36fe38..e6429926e957 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -276,7 +276,7 @@ static void kmemleak_disable(void);
* Print a warning and dump the stack trace.
*/
#define kmemleak_warn(x...) do { \
- pr_warning(x); \
+ pr_warn(x); \
dump_stack(); \
kmemleak_warning = 1; \
} while (0)
@@ -543,7 +543,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
- pr_warning("Cannot allocate a kmemleak_object structure\n");
+ pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
return NULL;
}
@@ -596,8 +596,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
else if (parent->pointer + parent->size <= ptr)
link = &parent->rb_node.rb_right;
else {
- kmemleak_stop("Cannot insert 0x%lx into the object "
- "search tree (overlaps existing)\n",
+ kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
* No need for parent->lock here since "parent" cannot
@@ -670,8 +669,8 @@ static void delete_object_part(unsigned long ptr, size_t size)
object = find_and_remove_object(ptr, 1);
if (!object) {
#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx "
- "(size %zu)\n", ptr, size);
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
#endif
return;
}
@@ -717,8 +716,8 @@ static void paint_ptr(unsigned long ptr, int color)
object = find_and_get_object(ptr, 0);
if (!object) {
- kmemleak_warn("Trying to color unknown object "
- "at 0x%08lx as %s\n", ptr,
+ kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
+ ptr,
(color == KMEMLEAK_GREY) ? "Grey" :
(color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
return;
@@ -764,7 +763,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
if (!area) {
- pr_warning("Cannot allocate a scan area\n");
+ pr_warn("Cannot allocate a scan area\n");
goto out;
}
@@ -1463,8 +1462,8 @@ static void kmemleak_scan(void)
if (new_leaks) {
kmemleak_found_leaks = true;
- pr_info("%d new suspected memory leaks (see "
- "/sys/kernel/debug/kmemleak)\n", new_leaks);
+ pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n",
+ new_leaks);
}
}
@@ -1515,7 +1514,7 @@ static void start_scan_thread(void)
return;
scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
if (IS_ERR(scan_thread)) {
- pr_warning("Failed to create the scan thread\n");
+ pr_warn("Failed to create the scan thread\n");
scan_thread = NULL;
}
}
@@ -1795,8 +1794,7 @@ static void kmemleak_do_cleanup(struct work_struct *work)
if (!kmemleak_found_leaks)
__kmemleak_do_cleanup();
else
- pr_info("Kmemleak disabled without freeing internal data. "
- "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
+ pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n");
}
static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
@@ -1874,8 +1872,8 @@ void __init kmemleak_init(void)
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
if (crt_early_log > ARRAY_SIZE(early_log))
- pr_warning("Early log buffer exceeded (%d), please increase "
- "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
+ pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
+ crt_early_log);
/* the kernel is still in UP mode, so disabling the IRQs is enough */
local_irq_save(flags);
@@ -1960,7 +1958,7 @@ static int __init kmemleak_late_init(void)
dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
&kmemleak_fops);
if (!dentry)
- pr_warning("Failed to create the debugfs kmemleak file\n");
+ pr_warn("Failed to create the debugfs kmemleak file\n");
mutex_lock(&scan_mutex);
start_scan_thread();
mutex_unlock(&scan_mutex);
diff --git a/mm/memblock.c b/mm/memblock.c
index fc7824fa1b42..b570dddb4cb9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -238,8 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
* so we use WARN_ONCE() here to see the stack trace if
* fail happens.
*/
- WARN_ONCE(1, "memblock: bottom-up allocation failed, "
- "memory hotunplug may be affected\n");
+ WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
}
return __memblock_find_range_top_down(start, end, size, align, nid,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 42882c1e7fce..36db05fa8acb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -638,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid,
- unsigned int lru_mask)
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
{
unsigned long nr = 0;
int zid;
@@ -1151,12 +1150,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- /* oom_info_lock ensures that parallel ooms do not interleave */
- static DEFINE_MUTEX(oom_info_lock);
struct mem_cgroup *iter;
unsigned int i;
- mutex_lock(&oom_info_lock);
rcu_read_lock();
if (p) {
@@ -1200,7 +1196,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont("\n");
}
- mutex_unlock(&oom_info_lock);
}
/*
@@ -1237,7 +1232,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
return limit;
}
-static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
struct oom_control oc = {
@@ -1315,6 +1310,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
}
unlock:
mutex_unlock(&oom_lock);
+ return chosen;
}
#if MAX_NUMNODES > 1
@@ -2325,9 +2321,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_online(memcg))
- return 0;
-
ret = try_charge(memcg, gfp, nr_pages);
if (ret)
return ret;
@@ -2346,10 +2339,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
- int ret;
+ int ret = 0;
memcg = get_mem_cgroup_from_mm(current->mm);
- ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (!mem_cgroup_is_root(memcg))
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
css_put(&memcg->css);
return ret;
}
@@ -2719,39 +2713,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static unsigned long tree_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
+static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_stat(iter, idx);
+ memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += mem_cgroup_read_stat(iter, i);
+ }
}
-static unsigned long tree_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx)
+static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_events(iter, idx);
+ memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_EVENTS; i++)
+ events[i] += mem_cgroup_read_events(iter, i);
+ }
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val;
+ unsigned long val = 0;
if (mem_cgroup_is_root(memcg)) {
- val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
- val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
- if (swap)
- val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_RSS);
+ if (swap)
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_SWAP);
+ }
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -2817,6 +2820,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
{
int memcg_id;
+ if (cgroup_memory_nokmem)
+ return 0;
+
BUG_ON(memcg->kmemcg_id >= 0);
BUG_ON(memcg->kmem_state);
@@ -2837,24 +2843,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
return 0;
}
-static int memcg_propagate_kmem(struct mem_cgroup *parent,
- struct mem_cgroup *memcg)
-{
- int ret = 0;
-
- mutex_lock(&memcg_limit_mutex);
- /*
- * If the parent cgroup is not kmem-online now, it cannot be
- * onlined after this point, because it has at least one child
- * already.
- */
- if (memcg_kmem_online(parent) ||
- (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
- ret = memcg_online_kmem(memcg);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
-}
-
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
struct cgroup_subsys_state *css;
@@ -2913,10 +2901,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
}
#else
-static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
-{
- return 0;
-}
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
return 0;
@@ -2932,22 +2916,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- int ret = 0;
+ int ret;
mutex_lock(&memcg_limit_mutex);
- /* Top-level cgroup doesn't propagate from root */
- if (!memcg_kmem_online(memcg)) {
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- ret = -EBUSY;
- if (ret)
- goto out;
- ret = memcg_online_kmem(memcg);
- if (ret)
- goto out;
- }
ret = page_counter_limit(&memcg->kmem, limit);
-out:
mutex_unlock(&memcg_limit_mutex);
return ret;
}
@@ -4198,7 +4170,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- error = memcg_propagate_kmem(parent, memcg);
+ error = memcg_online_kmem(memcg);
if (error)
goto fail;
@@ -4282,9 +4254,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
- mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
- memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
memcg->low = 0;
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -5015,6 +4989,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long nr_pages;
unsigned long high;
int err;
@@ -5025,6 +5000,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
+ nr_pages = page_counter_read(&memcg->memory);
+ if (nr_pages > high)
+ try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+ GFP_KERNEL, true);
+
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -5046,6 +5026,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+ bool drained = false;
unsigned long max;
int err;
@@ -5054,9 +5036,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
- err = mem_cgroup_resize_limit(memcg, max);
- if (err)
- return err;
+ xchg(&memcg->memory.limit, max);
+
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+
+ if (nr_pages <= max)
+ break;
+
+ if (signal_pending(current)) {
+ err = -EINTR;
+ break;
+ }
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ if (nr_reclaims) {
+ if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
+ GFP_KERNEL, true))
+ nr_reclaims--;
+ continue;
+ }
+
+ mem_cgroup_events(memcg, MEMCG_OOM, 1);
+ if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+ break;
+ }
memcg_wb_domain_size_changed(memcg);
return nbytes;
@@ -5077,6 +5086,8 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long stat[MEMCG_NR_STAT];
+ unsigned long events[MEMCG_NR_EVENTS];
int i;
/*
@@ -5090,22 +5101,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
+ tree_stat(memcg, stat);
+ tree_events(memcg, events);
+
seq_printf(m, "anon %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+ seq_printf(m, "kernel_stack %llu\n",
+ (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+ seq_printf(m, "slab %llu\n",
+ (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
+ stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+ (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) {
struct mem_cgroup *mi;
@@ -5117,12 +5133,17 @@ static int memory_stat_show(struct seq_file *m, void *v)
mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
}
+ seq_printf(m, "slab_reclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ seq_printf(m, "slab_unreclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ events[MEM_CGROUP_EVENTS_PGFAULT]);
seq_printf(m, "pgmajfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+ events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
return 0;
}
@@ -5395,6 +5416,10 @@ static void uncharge_list(struct list_head *page_list)
struct list_head *next;
struct page *page;
+ /*
+ * Note that the list can be a single page->lru; hence the
+ * do-while loop instead of a simple list_for_each_entry().
+ */
next = page_list->next;
do {
unsigned int nr_pages = 1;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 67c30eb993f0..5a544c6c0717 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -184,9 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
struct siginfo si;
int ret;
- printk(KERN_ERR
- "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
si.si_addr = (void *)addr;
@@ -209,8 +208,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
}
if (ret < 0)
- printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
- t->comm, t->pid, ret);
+ pr_info("MCE: Error sending signal to %s:%d: %d\n",
+ t->comm, t->pid, ret);
return ret;
}
@@ -290,8 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
} else {
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- printk(KERN_ERR
- "MCE: Out of memory while machine check handling\n");
+ pr_err("MCE: Out of memory while machine check handling\n");
return;
}
}
@@ -336,9 +334,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr_valid == 0) {
- printk(KERN_ERR
- "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
+ pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
force_sig(SIGKILL, tk->tsk);
}
@@ -350,9 +347,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
*/
else if (kill_proc(tk->tsk, tk->addr, trapno,
pfn, page, flags) < 0)
- printk(KERN_ERR
- "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
+ pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
kfree(tk);
@@ -563,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
*/
static int me_unknown(struct page *p, unsigned long pfn)
{
- printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+ pr_err("MCE %#lx: Unknown page state\n", pfn);
return MF_FAILED;
}
@@ -608,8 +604,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (mapping->a_ops->error_remove_page) {
err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
- pfn, err);
+ pr_info("MCE %#lx: Failed to punch page: %d\n",
+ pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
pr_info("MCE %#lx: failed to release buffers\n", pfn);
@@ -624,8 +620,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
- pfn);
+ pr_info("MCE %#lx: Failed to invalidate\n", pfn);
}
return ret;
}
@@ -854,8 +849,7 @@ static int page_action(struct page_state *ps, struct page *p,
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
- printk(KERN_ERR
- "MCE %#lx: %s still referenced by %d users\n",
+ pr_err("MCE %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
}
@@ -934,8 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
if (PageSwapCache(p)) {
- printk(KERN_ERR
- "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+ pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn);
ttu |= TTU_IGNORE_HWPOISON;
}
@@ -953,8 +946,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- printk(KERN_INFO
- "MCE %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -972,8 +964,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
- printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -1040,16 +1032,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
if (!pfn_valid(pfn)) {
- printk(KERN_ERR
- "MCE %#lx: memory outside kernel control\n",
- pfn);
+ pr_err("MCE %#lx: memory outside kernel control\n", pfn);
return -ENXIO;
}
p = pfn_to_page(pfn);
orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
- printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
+ pr_err("MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
@@ -1180,7 +1170,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
- printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ pr_err("MCE %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
put_hwpoison_page(hpage);
diff --git a/mm/memory.c b/mm/memory.c
index 0e247642ed5b..ac6bc15c19be 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -562,8 +562,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
- pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
@@ -661,9 +660,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
return;
}
if (nr_unshown) {
- printk(KERN_ALERT
- "BUG: Bad page map: %lu messages suppressed\n",
- nr_unshown);
+ pr_alert("BUG: Bad page map: %lu messages suppressed\n",
+ nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
@@ -674,15 +672,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- printk(KERN_ALERT
- "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+ current->comm,
+ (long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
- printk(KERN_ALERT
- "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
- (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
@@ -3419,12 +3415,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * Use pte_alloc() instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(pmd_none(*pmd)) &&
- unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pte_alloc(mm, pmd, address)))
return VM_FAULT_OOM;
/*
* If a huge pmd materialized under us just retry later. Use
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 24ea06393816..aa34431c3f31 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -33,6 +33,7 @@
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
+#include <linux/compaction.h>
#include <asm/tlbflush.h>
@@ -166,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page,
page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
set_page_private(page, info);
- atomic_inc(&page->_count);
+ page_ref_inc(page);
}
void put_page_bootmem(struct page *page)
@@ -177,7 +178,7 @@ void put_page_bootmem(struct page *page)
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
- if (atomic_dec_return(&page->_count) == 1) {
+ if (page_ref_dec_return(page) == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
@@ -1054,14 +1055,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = pfn_to_nid(pfn);
+ nid = zone_to_nid(zone);
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
- if (ret) {
- memory_notify(MEM_CANCEL_ONLINE, &arg);
- return ret;
- }
+ if (ret)
+ goto failed_addition;
+
/*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
@@ -1079,12 +1079,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
- printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) pfn << PAGE_SHIFT,
- (((unsigned long long) pfn + nr_pages)
- << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
- return ret;
+ goto failed_addition;
}
zone->present_pages += onlined_pages;
@@ -1094,7 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
pgdat_resize_unlock(zone->zone_pgdat, &flags);
if (onlined_pages) {
- node_states_set_node(zone_to_nid(zone), &arg);
+ node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL, NULL);
else
@@ -1105,8 +1100,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
init_per_zone_wmark_min();
- if (onlined_pages)
- kswapd_run(zone_to_nid(zone));
+ if (onlined_pages) {
+ kswapd_run(nid);
+ kcompactd_run(nid);
+ }
vm_total_pages = nr_free_pagecache_pages();
@@ -1115,6 +1112,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
return 0;
+
+failed_addition:
+ pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1526,8 +1530,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
} else {
#ifdef CONFIG_DEBUG_VM
- printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
- pfn);
+ pr_alert("removing pfn %lx from LRU failed\n", pfn);
dump_page(page, "failed to remove from LRU");
#endif
put_page(page);
@@ -1855,7 +1858,7 @@ repeat:
ret = -EBUSY;
goto failed_removal;
}
- printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
+ pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
@@ -1880,8 +1883,10 @@ repeat:
zone_pcp_update(zone);
node_states_clear_node(node, &arg);
- if (arg.status_change_nid >= 0)
+ if (arg.status_change_nid >= 0) {
kswapd_stop(node);
+ kcompactd_stop(node);
+ }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
@@ -1890,9 +1895,9 @@ repeat:
return 0;
failed_removal:
- printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) start_pfn << PAGE_SHIFT,
- ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
@@ -1965,8 +1970,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
- pr_warn("removing memory fails, because memory "
- "[%pa-%pa] is onlined\n",
+ pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8cbc74387df3..b25de27b83d0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2559,9 +2559,7 @@ static void __init check_numabalancing_enable(void)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
- pr_info("%s automatic NUMA balancing. "
- "Configure with numa_balancing= or the "
- "kernel.numa_balancing sysctl",
+ pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 7924f4f58a6d..07c383ddbbab 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -310,25 +310,36 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
- * Note: using __GFP_ZERO is not supported.
+ * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported.
*/
-void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
wait_queue_t wait;
gfp_t gfp_temp;
+ /* If oom killed, memory reserves are essential to prevent livelock */
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
+ /* No element size to zero on allocation */
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
- gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
+ if (likely(pool->curr_nr)) {
+ /*
+ * Don't allocate from emergency reserves if there are
+ * elements available. This check is racy, but it will
+ * be rechecked each loop.
+ */
+ gfp_temp |= __GFP_NOMEMALLOC;
+ }
element = pool->alloc(gfp_temp, pool->pool_data);
if (likely(element != NULL))
@@ -352,11 +363,12 @@ repeat_alloc:
* We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
- if (gfp_temp != gfp_mask) {
+ if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) {
spin_unlock_irqrestore(&pool->lock, flags);
gfp_temp = gfp_mask;
goto repeat_alloc;
}
+ gfp_temp = gfp_mask;
/* We must not sleep if !__GFP_DIRECT_RECLAIM */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 568284ec75d4..6c822a7b27e0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -172,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
else
page_add_file_rmap(new);
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
/* No need to invalidate - it was non-present before */
@@ -187,14 +187,17 @@ out:
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-static void remove_migration_ptes(struct page *old, struct page *new)
+void remove_migration_ptes(struct page *old, struct page *new, bool locked)
{
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
.arg = old,
};
- rmap_walk(new, &rwc);
+ if (locked)
+ rmap_walk_locked(new, &rwc);
+ else
+ rmap_walk(new, &rwc);
}
/*
@@ -349,7 +352,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- if (!page_freeze_refs(page, expected_count)) {
+ if (!page_ref_freeze(page, expected_count)) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -363,7 +366,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
if (mode == MIGRATE_ASYNC && head &&
!buffer_migrate_lock_buffers(head, mode)) {
- page_unfreeze_refs(page, expected_count);
+ page_ref_unfreeze(page, expected_count);
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -397,7 +400,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
* to one less reference.
* We know this isn't the last reference.
*/
- page_unfreeze_refs(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - 1);
spin_unlock(&mapping->tree_lock);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -451,7 +454,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- if (!page_freeze_refs(page, expected_count)) {
+ if (!page_ref_freeze(page, expected_count)) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -463,7 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
radix_tree_replace_slot(pslot, newpage);
- page_unfreeze_refs(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
@@ -702,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(page, page);
+ remove_migration_ptes(page, page, false);
rc = mapping->a_ops->writepage(page, &wbc);
@@ -900,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (page_was_mapped)
remove_migration_ptes(page,
- rc == MIGRATEPAGE_SUCCESS ? newpage : page);
+ rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
out_unlock_both:
unlock_page(newpage);
@@ -1070,7 +1073,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_was_mapped)
remove_migration_ptes(hpage,
- rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
+ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
unlock_page(new_hpage);
@@ -1773,7 +1776,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
put_page(new_page);
goto out_fail;
}
-
+ /*
+ * We are not sure a pending tlb flush here is for a huge page
+ * mapping or not. Hence use the tlb range variant
+ */
if (mm_tlb_flush_pending(mm))
flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1829,12 +1835,11 @@ fail_putback:
page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
if (page_count(page) != 2) {
set_pmd_at(mm, mmun_start, pmd, orig_entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
+ flush_pmd_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(new_page, true);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fdadf918de76..5b72266b4b03 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -55,13 +55,12 @@ void __init mminit_verify_zonelist(void)
/* Iterate the zonelist */
for_each_zone_zonelist(zone, z, zonelist, zoneid) {
#ifdef CONFIG_NUMA
- printk(KERN_CONT "%d:%s ",
- zone->node, zone->name);
+ pr_cont("%d:%s ", zone->node, zone->name);
#else
- printk(KERN_CONT "0:%s ", zone->name);
+ pr_cont("0:%s ", zone->name);
#endif /* CONFIG_NUMA */
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 90e3b869a8b9..e06345aafa03 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -37,7 +37,6 @@
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
-#include <linux/sched/sysctl.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
@@ -123,130 +122,6 @@ void vma_set_page_prot(struct vm_area_struct *vma)
}
}
-
-int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-/*
- * Make sure vm_committed_as in one cacheline and not cacheline shared with
- * other variables. It can be updated by several CPUs frequently.
- */
-struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
-
-/*
- * The global memory commitment made in the system can be a metric
- * that can be used to drive ballooning decisions when Linux is hosted
- * as a guest. On Hyper-V, the host implements a policy engine for dynamically
- * balancing memory across competing virtual machines that are hosted.
- * Several metrics drive this policy engine including the guest reported
- * memory commitment.
- */
-unsigned long vm_memory_committed(void)
-{
- return percpu_counter_read_positive(&vm_committed_as);
-}
-EXPORT_SYMBOL_GPL(vm_memory_committed);
-
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 0 means there is enough memory for the allocation to
- * succeed and -ENOMEM implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- *
- * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
- *
- * Note this is a helper function intended to be used by LSMs which
- * wish to use this logic.
- */
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
-{
- long free, allowed, reserve;
-
- VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
- -(s64)vm_committed_as_batch * num_online_cpus(),
- "memory commitment underflow");
-
- vm_acct_memory(pages);
-
- /*
- * Sometimes we want to use more memory than we have
- */
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
- return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
- goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = vm_commit_limit();
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min_t(long, mm->total_vm / 32, reserve);
- }
-
- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
- return 0;
-error:
- vm_unacct_memory(pages);
-
- return -ENOMEM;
-}
-
/*
* Requires inode->i_mapping->i_mmap_rwsem
*/
@@ -2642,9 +2517,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long ret = -EINVAL;
struct file *file;
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
- "See Documentation/vm/remap_file_pages.txt.\n",
- current->comm, current->pid);
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
+ current->comm, current->pid);
if (prot)
return ret;
@@ -3010,8 +2884,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (is_data_mapping(flags) &&
mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
if (ignore_rlimit_data)
- pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
- "%lu. Will be forbidden soon.\n",
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n",
current->comm, current->pid,
(mm->data_vm + npages) << PAGE_SHIFT,
rlimit(RLIMIT_DATA));
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5fbdd367bbed..f4259e496f83 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter <cl@linux.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
diff --git a/mm/mremap.c b/mm/mremap.c
index 8eeba02fc991..3fa0a467df66 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -20,7 +20,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/sysctl.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>
@@ -214,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
continue;
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
- if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
- new_pmd, new_addr))
+ if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 99feb2b07fc5..bd05a70f44b9 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -288,7 +288,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
/*
* Whoops, we cannot satisfy the allocation request.
*/
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
@@ -360,7 +360,7 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (ptr)
return ptr;
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index fbf6f0f1d6c9..6402f2715d48 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,7 +33,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
-#include <linux/sched/sysctl.h>
#include <linux/printk.h>
#include <asm/uaccess.h>
@@ -48,33 +47,11 @@ struct page *mem_map;
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
unsigned long highest_memmap_pfn;
-struct percpu_counter vm_committed_as;
-int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
-/*
- * The global memory commitment made in the system can be a metric
- * that can be used to drive ballooning decisions when Linux is hosted
- * as a guest. On Hyper-V, the host implements a policy engine for dynamically
- * balancing memory across competing virtual machines that are hosted.
- * Several metrics drive this policy engine including the guest reported
- * memory commitment.
- */
-unsigned long vm_memory_committed(void)
-{
- return percpu_counter_read_positive(&vm_committed_as);
-}
-
-EXPORT_SYMBOL_GPL(vm_memory_committed);
-
EXPORT_SYMBOL(mem_map);
/* list of mapped, potentially shareable regions */
@@ -1829,100 +1806,6 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 0 means there is enough memory for the allocation to
- * succeed and -ENOMEM implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- *
- * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
- *
- * Note this is a helper function intended to be used by LSMs which
- * wish to use this logic.
- */
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
-{
- long free, allowed, reserve;
-
- vm_acct_memory(pages);
-
- /*
- * Sometimes we want to use more memory than we have
- */
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
- return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
- goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = vm_commit_limit();
- /*
- * Reserve some 3% for root
- */
- if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min_t(long, mm->total_vm / 32, reserve);
- }
-
- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
- return 0;
-
-error:
- vm_unacct_memory(pages);
-
- return -ENOMEM;
-}
-
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e97a05d9621f..06f7e1707847 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -287,9 +287,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !is_sysrq_oom(oc))
- return OOM_SCAN_ABORT;
-
return OOM_SCAN_OK;
}
@@ -386,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, "
- "oom_score_adj=%hd\n",
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
@@ -459,15 +455,11 @@ void exit_oom_victim(void)
bool oom_killer_disable(void)
{
/*
- * Make sure to not race with an ongoing OOM killer
- * and that the current is not the victim.
+ * Make sure to not race with an ongoing OOM killer. Check that the
+ * current is not killed (possibly due to sharing the victim's memory).
*/
- mutex_lock(&oom_lock);
- if (test_thread_flag(TIF_MEMDIE)) {
- mutex_unlock(&oom_lock);
+ if (mutex_lock_killable(&oom_lock))
return false;
- }
-
oom_killer_disabled = true;
mutex_unlock(&oom_lock);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c46b75d14b6f..a762be57e46e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -307,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
+ unsigned long max_initialise;
+
/* Always populate low zones for address-contrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
+ /*
+ * Initialise at least 2G of a node but also take into account that
+ * two large system hashes that can take up 1GB for 0.25TB/node.
+ */
+ max_initialise = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
- /* Initialise at least 2G of the highest zone */
(*nr_initialised)++;
- if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ if ((*nr_initialised > max_initialise) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -498,6 +506,7 @@ void prep_compound_page(struct page *page, unsigned int order)
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly
= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
@@ -542,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value\n");
return 0;
}
_debug_guardpage_minorder = res;
- printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
@@ -764,7 +773,7 @@ static inline int free_pages_check(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
@@ -1460,7 +1469,7 @@ static inline int check_new_page(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -2348,19 +2357,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
list_del(&page->lru);
pcp->count--;
} else {
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /*
- * __GFP_NOFAIL is not to be used in new code.
- *
- * All __GFP_NOFAIL callers should be fixed so that they
- * properly detect and handle allocation failures.
- *
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with
- * __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- }
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
page = NULL;
@@ -2857,8 +2858,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
* keep looping as per tradition.
+ *
+ * But do not keep looping if oom_killer_disable()
+ * was already called, for the system is trying to
+ * enter a quiescent state during suspend.
*/
- *did_some_progress = 1;
+ *did_some_progress = !oom_killer_disabled;
goto out;
}
if (pm_suspended_storage())
@@ -3117,14 +3122,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
- /*
- * If this allocation cannot block and it is for a specific node, then
- * fail early. There's no need to wakeup kswapd or retry for a
- * speculative node-specific allocation.
- */
- if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
- goto nopage;
-
retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
@@ -3481,7 +3478,7 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- atomic_add(size - 1, &page->_count);
+ page_ref_add(page, size - 1);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3493,7 +3490,7 @@ refill:
if (unlikely(offset < 0)) {
page = virt_to_page(nc->va);
- if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3501,7 +3498,7 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- atomic_set(&page->_count, size);
+ set_page_count(page, size);
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = size;
@@ -3712,6 +3709,49 @@ static inline void show_node(struct zone *zone)
printk("Node %d ", zone_to_nid(zone));
}
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += zone->watermark[WMARK_LOW];
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ */
+ available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab consists of items that are in use,
+ * and cannot be freed. Cap this estimate at the low watermark.
+ */
+ available += global_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
@@ -4044,9 +4084,7 @@ static int __parse_numa_zonelist_order(char *s)
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
- printk(KERN_WARNING
- "Ignoring invalid numa_zonelist_order value: "
- "%s\n", s);
+ pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4510,12 +4548,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. "
- "Total pages: %ld\n",
- nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
- page_group_by_mobility_disabled ? "off" : "on",
- vm_total_pages);
+ pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ nr_online_nodes,
+ zonelist_order_name[current_zonelist_order],
+ page_group_by_mobility_disabled ? "off" : "on",
+ vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
@@ -5404,6 +5441,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -5428,8 +5468,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds freesize %lu\n",
+ pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
@@ -5637,8 +5676,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
min_pfn = min(min_pfn, start_pfn);
if (min_pfn == ULONG_MAX) {
- printk(KERN_WARNING
- "Could not find start_pfn for node %d\n", nid);
+ pr_warn("Could not find start_pfn for node %d\n", nid);
return 0;
}
@@ -6110,22 +6148,21 @@ void __init mem_init_print_info(const char *str)
#undef adj_init_size
- pr_info("Memory: %luK/%luK available "
- "(%luK kernel code, %luK rwdata, %luK rodata, "
- "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
+ pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
- ", %luK highmem"
+ ", %luK highmem"
#endif
- "%s%s)\n",
- nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
- totalcma_pages << (PAGE_SHIFT-10),
+ "%s%s)\n",
+ nr_free_pages() << (PAGE_SHIFT - 10),
+ physpages << (PAGE_SHIFT - 10),
+ codesize >> 10, datasize >> 10, rosize >> 10,
+ (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT-10),
+ totalhigh_pages << (PAGE_SHIFT - 10),
#endif
- str ? ", " : "", str ? str : "");
+ str ? ", " : "", str ? str : "");
}
/**
@@ -6300,8 +6337,17 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_MIN] = tmp;
}
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ /*
+ * Set the kswapd watermarks distance according to the
+ * scale factor in proportion to available memory, but
+ * ensure a minimum size on small systems.
+ */
+ tmp = max_t(u64, tmp >> 2,
+ mult_frac(zone->managed_pages,
+ watermark_scale_factor, 10000));
+
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6442,6 +6488,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write)
+ setup_per_zone_wmarks();
+
+ return 0;
+}
+
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
@@ -6633,11 +6694,8 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
- tablename,
- (1UL << log2qty),
- ilog2(size) - PAGE_SHIFT,
- size);
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
if (_hash_shift)
*_hash_shift = log2qty;
@@ -6788,7 +6846,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* This check already skips compound tails of THP
* because their page->_count is zero at all time.
*/
- if (!atomic_read(&page->_count)) {
+ if (!page_ref_count(page)) {
if (PageBuddy(page))
iter += (1 << page_order(page)) - 1;
continue;
@@ -7138,8 +7196,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
#ifdef CONFIG_DEBUG_VM
- printk(KERN_INFO "remove from free list %lx %d %lx\n",
- pfn, 1 << order, end_pfn);
+ pr_info("remove from free list %lx %d %lx\n",
+ pfn, 1 << order, end_pfn);
#endif
list_del(&page->lru);
rmv_page_order(page);
@@ -7152,7 +7210,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
-#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -7171,4 +7228,3 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
-#endif
diff --git a/mm/page_io.c b/mm/page_io.c
index b995a5ba5e8f..ff74e512f029 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -56,10 +56,10 @@ void end_swap_bio_write(struct bio *bio)
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
- printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
@@ -73,10 +73,10 @@ static void end_swap_bio_read(struct bio *bio)
if (bio->bi_error) {
SetPageError(page);
ClearPageUptodate(page);
- printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
@@ -216,7 +216,7 @@ reprobe:
out:
return ret;
bad_bmap:
- printk(KERN_ERR "swapon: swapfile has holes\n");
+ pr_err("swapon: swapfile has holes\n");
ret = -EINVAL;
goto out;
}
@@ -290,8 +290,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
*/
set_page_dirty(page);
ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
- page_file_offset(page));
+ pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
+ page_file_offset(page));
}
end_page_writeback(page);
return ret;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 44ad1f00c4e1..ac3d8d129974 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -198,9 +198,8 @@ void __dump_page_owner(struct page *page)
return;
}
- pr_alert("page allocated via order %u, migratetype %s, "
- "gfp_mask %#x(%pGg)\n", page_ext->order,
- migratetype_names[mt], gfp_mask, &gfp_mask);
+ pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
print_stack_trace(&trace, 0);
if (page_ext->last_migrate_reason != -1)
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 10e3d0b8a86d..d66911ff42d9 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
/* all units must be in a single group */
if (ai->nr_groups != 1) {
- printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+ pr_crit("can't handle more than one group\n");
return -EINVAL;
}
@@ -103,8 +103,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
alloc_pages = roundup_pow_of_two(nr_pages);
if (alloc_pages > nr_pages)
- printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
- alloc_pages - nr_pages);
+ pr_warn("wasting %zu pages per chunk\n",
+ alloc_pages - nr_pages);
return 0;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 998607adf6eb..0c59684f1ff2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -53,6 +53,8 @@
* setup the first chunk containing the kernel static percpu area
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
@@ -888,8 +890,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
size = ALIGN(size, 2);
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
- WARN(true, "illegal size (%zu) or align (%zu) for "
- "percpu allocation\n", size, align);
+ WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
+ size, align);
return NULL;
}
@@ -1033,11 +1035,11 @@ fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
if (!is_atomic && warn_limit) {
- pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
- size, align, is_atomic, err);
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
dump_stack();
if (!--warn_limit)
- pr_info("PERCPU: limit reached, disable warning\n");
+ pr_info("limit reached, disable warning\n");
}
if (is_atomic) {
/* see the flag handling in pcpu_blance_workfn() */
@@ -1449,20 +1451,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
- printk(KERN_CONT "\n");
+ pr_cont("\n");
printk("%spcpu-alloc: ", lvl);
}
- printk(KERN_CONT "[%0*d] ", group_width, group);
+ pr_cont("[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
- printk(KERN_CONT "%0*d ", cpu_width,
- gi->cpu_map[unit]);
+ pr_cont("%0*d ",
+ cpu_width, gi->cpu_map[unit]);
else
- printk(KERN_CONT "%s ", empty_str);
+ pr_cont("%s ", empty_str);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
/**
@@ -1538,8 +1540,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
- pr_emerg("PERCPU: failed to initialize, %s", #cond); \
- pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \
+ pr_emerg("failed to initialize, %s\n", #cond); \
+ pr_emerg("cpu_possible_mask=%*pb\n", \
cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
@@ -1723,7 +1725,7 @@ static int __init percpu_alloc_setup(char *str)
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
else
- pr_warning("PERCPU: unknown allocator %s specified\n", str);
+ pr_warn("unknown allocator %s specified\n", str);
return 0;
}
@@ -2016,9 +2018,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > VMALLOC_TOTAL * 3 / 4) {
- pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
- "space 0x%lx\n", max_distance,
- VMALLOC_TOTAL);
+ pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
+ max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
@@ -2026,7 +2027,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
#endif
}
- pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
+ pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
@@ -2100,8 +2101,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
- pr_warning("PERCPU: failed to allocate %s page "
- "for cpu%u\n", psize_str, cpu);
+ pr_warn("failed to allocate %s page for cpu%u\n",
+ psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
@@ -2140,7 +2141,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
}
/* we're ready, commit */
- pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
+ pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
unit_pages, psize_str, vm.addr, ai->static_size,
ai->reserved_size, ai->dyn_size);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 06a005b979a7..71c5f9109f2a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
-
-/*
- * ARCHes with special requirements for evicting THP backing TLB entries can
- * implement this. Otherwise also, it can help optimize normal TLB flush in
- * THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB if flush span is greater than a threshold, which will
- * likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desirable.
- * e.g. see arch/arc: flush_pmd_tlb_range
- */
-#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
-#endif
-
#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 942212970529..daf6ff6e199a 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -8,7 +8,7 @@
* improved on it.
*
* Copyright (C) 2007 SGI,
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter <cl@linux.com>
* Generalized, added support for multiple lists and
* constructors / destructors.
*/
diff --git a/mm/rmap.c b/mm/rmap.c
index 02f0bfc3c80a..c399a0d41b31 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1431,6 +1431,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
goto out;
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_address(vma, address,
+ flags & TTU_MIGRATION, page);
+ /* check if we have anything to do after split */
+ if (page_mapcount(page) == 0)
+ goto out;
+ }
+
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -1576,10 +1584,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return is_vma_temporary_stack(vma);
}
-static int page_not_mapped(struct page *page)
+static int page_mapcount_is_zero(struct page *page)
{
- return !page_mapped(page);
-};
+ return !page_mapcount(page);
+}
/**
* try_to_unmap - try to remove all page table mappings to a page
@@ -1606,12 +1614,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = &rp,
- .done = page_not_mapped,
+ .done = page_mapcount_is_zero,
.anon_lock = page_lock_anon_vma_read,
};
- VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
-
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
@@ -1623,9 +1629,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
- ret = rmap_walk(page, &rwc);
+ if (flags & TTU_RMAP_LOCKED)
+ ret = rmap_walk_locked(page, &rwc);
+ else
+ ret = rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapped(page)) {
+ if (ret != SWAP_MLOCK && !page_mapcount(page)) {
ret = SWAP_SUCCESS;
if (rp.lazyfreed && !PageDirty(page))
ret = SWAP_LZFREE;
@@ -1633,6 +1642,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
return ret;
}
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+};
+
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
@@ -1715,14 +1729,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
- anon_vma = rmap_walk_anon_lock(page, rwc);
+ if (locked) {
+ anon_vma = page_anon_vma(page);
+ /* anon_vma disappear under us? */
+ VM_BUG_ON_PAGE(!anon_vma, page);
+ } else {
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ }
if (!anon_vma)
return ret;
@@ -1742,7 +1763,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
if (rwc->done && rwc->done(page))
break;
}
- anon_vma_unlock_read(anon_vma);
+
+ if (!locked)
+ anon_vma_unlock_read(anon_vma);
return ret;
}
@@ -1759,9 +1782,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
pgoff_t pgoff;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
@@ -1778,7 +1802,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
return ret;
pgoff = page_to_pgoff(page);
- i_mmap_lock_read(mapping);
+ if (!locked)
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
@@ -1795,7 +1820,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
}
done:
- i_mmap_unlock_read(mapping);
+ if (!locked)
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -1804,9 +1830,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
if (unlikely(PageKsm(page)))
return rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rwc);
+ return rmap_walk_anon(page, rwc, false);
+ else
+ return rmap_walk_file(page, rwc, false);
+}
+
+/* Like rmap_walk, but caller holds relevant rmap lock */
+int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+{
+ /* no ksm support for now */
+ VM_BUG_ON_PAGE(PageKsm(page), page);
+ if (PageAnon(page))
+ return rmap_walk_anon(page, rwc, true);
else
- return rmap_walk_file(page, rwc);
+ return rmap_walk_file(page, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/shmem.c b/mm/shmem.c
index 1acfdbc4bd9e..9428c51ab2d6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -376,28 +376,23 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
if (iter.index >= end)
break;
page = radix_tree_deref_slot(slot);
- /*
- * This should only be possible to happen at index 0, so we
- * don't need to reset the counter, nor do we risk infinite
- * restarts.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
if (radix_tree_exceptional_entry(page))
swapped++;
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
@@ -1947,12 +1942,13 @@ static void shmem_tag_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
page = radix_tree_deref_slot(slot);
if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
} else if (page_count(page) - page_mapcount(page) > 1) {
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_set(&mapping->page_tree, iter.index,
@@ -1962,8 +1958,7 @@ restart:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2000,14 +1995,15 @@ static int shmem_wait_for_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
start, SHMEM_TAG_PINNED) {
page = radix_tree_deref_slot(slot);
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
page = NULL;
}
@@ -2032,8 +2028,7 @@ restart:
continue_resched:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2823,9 +2818,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if ((value = strchr(this_char,'=')) != NULL) {
*value++ = 0;
} else {
- printk(KERN_ERR
- "tmpfs: No value for mount option '%s'\n",
- this_char);
+ pr_err("tmpfs: No value for mount option '%s'\n",
+ this_char);
goto error;
}
@@ -2880,8 +2874,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (mpol_parse_str(value, &mpol))
goto bad_val;
} else {
- printk(KERN_ERR "tmpfs: Bad mount option %s\n",
- this_char);
+ pr_err("tmpfs: Bad mount option %s\n", this_char);
goto error;
}
}
@@ -2889,7 +2882,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
return 0;
bad_val:
- printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
+ pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
value, this_char);
error:
mpol_put(mpol);
@@ -3286,14 +3279,14 @@ int __init shmem_init(void)
error = register_filesystem(&shmem_fs_type);
if (error) {
- printk(KERN_ERR "Could not register tmpfs\n");
+ pr_err("Could not register tmpfs\n");
goto out2;
}
shm_mnt = kern_mount(&shmem_fs_type);
if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
- printk(KERN_ERR "Could not kern_mount tmpfs\n");
+ pr_err("Could not kern_mount tmpfs\n");
goto out1;
}
return 0;
diff --git a/mm/slab.c b/mm/slab.c
index 852fc5c79829..e719a5cb3396 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -474,7 +474,7 @@ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
static void __slab_error(const char *function, struct kmem_cache *cachep,
char *msg)
{
- printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
+ pr_err("slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -670,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return flags;
+ return flags & ~__GFP_NOFAIL;
}
#else /* CONFIG_NUMA */
@@ -841,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
/*
- * Construct gfp mask to allocate from a specific node but do not direct reclaim
- * or warn about failures. kswapd may still wake to reclaim in the background.
+ * Construct gfp mask to allocate from a specific node but do not reclaim or
+ * warn about failures.
*/
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
}
#endif
@@ -1442,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
*/
static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
{
- const unsigned long nr_freed = (1 << cachep->gfporder);
+ int order = cachep->gfporder;
+ unsigned long nr_freed = (1 << order);
- kmemcheck_free_shadow(page, cachep->gfporder);
+ kmemcheck_free_shadow(page, order);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
sub_zone_page_state(page_zone(page),
@@ -1461,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_kmem_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(page, order, cachep);
+ __free_pages(page, order);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -1551,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit)
unsigned char error = 0;
int bad_count = 0;
- printk(KERN_ERR "%03x: ", offset);
+ pr_err("%03x: ", offset);
for (i = 0; i < limit; i++) {
if (data[offset + i] != POISON_FREE) {
error = data[offset + i];
@@ -1564,13 +1566,11 @@ static void dump_line(char *data, int offset, int limit)
if (bad_count == 1) {
error ^= POISON_FREE;
if (!(error & (error - 1))) {
- printk(KERN_ERR "Single bit error detected. Probably "
- "bad RAM.\n");
+ pr_err("Single bit error detected. Probably bad RAM.\n");
#ifdef CONFIG_X86
- printk(KERN_ERR "Run memtest86+ or a similar memory "
- "test tool.\n");
+ pr_err("Run memtest86+ or a similar memory test tool.\n");
#else
- printk(KERN_ERR "Run a memory test tool.\n");
+ pr_err("Run a memory test tool.\n");
#endif
}
}
@@ -1585,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
char *realobj;
if (cachep->flags & SLAB_RED_ZONE) {
- printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
- *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ pr_err("Redzone: 0x%llx/0x%llx\n",
+ *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
if (cachep->flags & SLAB_STORE_USER) {
- printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
+ pr_err("Last user: [<%p>](%pSR)\n",
*dbg_userword(cachep, objp),
*dbg_userword(cachep, objp));
}
@@ -1627,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Mismatch ! */
/* Print header */
if (lines == 0) {
- printk(KERN_ERR
- "Slab corruption (%s): %s start=%p, len=%d\n",
- print_tainted(), cachep->name, realobj, size);
+ pr_err("Slab corruption (%s): %s start=%p, len=%d\n",
+ print_tainted(), cachep->name,
+ realobj, size);
print_objinfo(cachep, objp, 0);
}
/* Hexdump the affected line */
@@ -1656,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
if (objnr) {
objp = index_to_obj(cachep, page, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
- printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
- realobj, size);
+ pr_err("Prev obj: start=%p, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
objp = index_to_obj(cachep, page, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
- printk(KERN_ERR "Next obj: start=%p, len=%d\n",
- realobj, size);
+ pr_err("Next obj: start=%p, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
}
@@ -1691,11 +1689,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "start of a freed object "
- "was overwritten");
+ slab_error(cachep, "start of a freed object was overwritten");
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "end of a freed object "
- "was overwritten");
+ slab_error(cachep, "end of a freed object was overwritten");
}
}
}
@@ -2396,11 +2392,9 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " end of an object");
+ slab_error(cachep, "constructor overwrote the end of an object");
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " start of an object");
+ slab_error(cachep, "constructor overwrote the start of an object");
}
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON) {
@@ -2467,8 +2461,8 @@ static void slab_put_obj(struct kmem_cache *cachep,
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
if (get_free_obj(page, i) == objnr) {
- printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
+ pr_err("slab: double free detected in cache '%s', objp %p\n",
+ cachep->name, objp);
BUG();
}
}
@@ -2587,7 +2581,7 @@ failed:
static void kfree_debugcheck(const void *objp)
{
if (!virt_addr_valid(objp)) {
- printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
+ pr_err("kfree_debugcheck: out of range ptr %lxh\n",
(unsigned long)objp);
BUG();
}
@@ -2611,8 +2605,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
else
slab_error(cache, "memory outside object was overwritten");
- printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
- obj, redzone1, redzone2);
+ pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ obj, redzone1, redzone2);
}
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
@@ -2899,12 +2893,10 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
- slab_error(cachep, "double free, or memory outside"
- " object was overwritten");
- printk(KERN_ERR
- "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
- objp, *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ slab_error(cachep, "double free, or memory outside object was overwritten");
+ pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ objp, *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
@@ -2915,7 +2907,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
cachep->ctor(objp);
if (ARCH_SLAB_MINALIGN &&
((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
- printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
objp, (int)ARCH_SLAB_MINALIGN);
}
return objp;
@@ -3842,7 +3834,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
- printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
+ pr_err("enable_cpucache failed for %s, error %d\n",
cachep->name, -err);
return err;
}
@@ -3998,7 +3990,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
name = cachep->name;
if (error)
- printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
+ pr_err("slab: cache %s error: %s\n", name, error);
sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
@@ -4026,8 +4018,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
- "%4lu %4lu %4lu %4lu %4lu",
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
allocs, high, grown,
reaped, errors, max_freeable, node_allocs,
node_frees, overflows);
diff --git a/mm/slab.h b/mm/slab.h
index b7934361f026..ff39a8fc3b3f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -246,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
+ int ret;
+
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
- return __memcg_kmem_charge_memcg(page, gfp, order,
- s->memcg_params.memcg);
+
+ ret = __memcg_kmem_charge_memcg(page, gfp, order,
+ s->memcg_params.memcg);
+ if (ret)
+ return ret;
+
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ return 0;
+}
+
+static __always_inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ memcg_kmem_uncharge(page, order);
}
extern void slab_init_memcg_params(struct kmem_cache *);
@@ -294,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
return 0;
}
+static inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+}
+
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6afb2263a5c5..b2e379639a5b 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -442,7 +442,7 @@ out_unlock:
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
else {
- printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
+ pr_warn("kmem_cache_create(%s) failed with error %d\n",
name, err);
dump_stack();
}
@@ -510,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
* The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (!memcg_kmem_online(memcg))
+ if (memcg->kmem_state != KMEM_ONLINE)
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -726,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
err = shutdown_cache(s, &release, &need_rcu_barrier);
if (err) {
- pr_err("kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
+ pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
+ s->name);
dump_stack();
}
out_unlock:
@@ -1047,13 +1047,11 @@ static void print_slabinfo_header(struct seq_file *m)
#else
seq_puts(m, "slabinfo - version: 2.1\n");
#endif
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
- "<objperslab> <pagesperslab>");
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#ifdef CONFIG_DEBUG_SLAB
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
- "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
diff --git a/mm/slub.c b/mm/slub.c
index 6c91324f9370..7277413ebc8b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -254,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
void *p;
-#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (!debug_pagealloc_enabled())
+ return get_freepointer(s, object);
+
probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
-#else
- p = get_freepointer(s, object);
-#endif
return p;
}
@@ -951,14 +950,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
max_objects = MAX_OBJS_PER_PAGE;
if (page->objects != max_objects) {
- slab_err(s, page, "Wrong number of objects. Found %d but "
- "should be %d", page->objects, max_objects);
+ slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
+ page->objects, max_objects);
page->objects = max_objects;
slab_fix(s, "Number of objects adjusted.");
}
if (page->inuse != page->objects - nr) {
- slab_err(s, page, "Wrong object count. Counter is %d but "
- "counted were %d", page->inuse, page->objects - nr);
+ slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
+ page->inuse, page->objects - nr);
page->inuse = page->objects - nr;
slab_fix(s, "Object count adjusted.");
}
@@ -1118,8 +1117,8 @@ static inline int free_consistency_checks(struct kmem_cache *s,
if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
- slab_err(s, page, "Attempt to free object(0x%p) "
- "outside of slab", object);
+ slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
+ object);
} else if (!page->slab_cache) {
pr_err("SLUB <none>: no slab for object 0x%p.\n",
object);
@@ -1427,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
- alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
@@ -1540,7 +1539,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_kmem_pages(page, order);
+ memcg_uncharge_slab(page, order, s);
+ __free_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -3439,10 +3439,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
- panic("Cannot create slab %s size=%lu realsize=%u "
- "order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)s->size, s->size,
- oo_order(s->oo), s->offset, flags);
+ panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
+ s->name, (unsigned long)s->size, s->size,
+ oo_order(s->oo), s->offset, flags);
return -EINVAL;
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index b60802b3e5ea..68885dcbaf40 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
- printk(KERN_WARNING "[%lx-%lx] potential offnode "
- "page_structs\n", start, end - 1);
+ pr_warn("[%lx-%lx] potential offnode page_structs\n",
+ start, end - 1);
}
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
@@ -292,8 +292,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3717ceed4177..5d0cf4540364 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -313,9 +313,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
if (usemap_nid != nid) {
- printk(KERN_INFO
- "node %d must be removed before remove section %ld\n",
- nid, usemap_snr);
+ pr_info("node %d must be removed before remove section %ld\n",
+ nid, usemap_snr);
return;
}
/*
@@ -324,10 +323,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
* gather other removable sections for dynamic partitioning.
* Just notify un-removable section's number here.
*/
- printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
- pgdat_snr, nid);
- printk(KERN_CONT
- " have a circular dependency on usemap and pgdat allocations\n");
+ pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
+ usemap_snr, pgdat_snr, nid);
}
#else
static unsigned long * __init
@@ -355,7 +352,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data,
usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
size * usemap_count);
if (!usemap) {
- printk(KERN_WARNING "%s: allocation failed\n", __func__);
+ pr_warn("%s: allocation failed\n", __func__);
return;
}
@@ -428,8 +425,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
}
}
@@ -456,8 +453,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
if (map)
return map;
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
return NULL;
}
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index b5f7f24b8dd1..310ac0b8f974 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -174,9 +174,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
return 0;
nomem:
- printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
- printk(KERN_INFO
- "swap_cgroup can be disabled by swapaccount=0 boot option\n");
+ pr_info("couldn't allocate enough memory for swap_cgroup\n");
+ pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n");
return -ENOMEM;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d2c37365e2d6..b86cf26a586b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2526,8 +2526,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
- pr_info("Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
+ pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 806b0c758c5b..9f3a0290b273 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -230,8 +230,7 @@ retry:
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
- dst_addr))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
err = -ENOMEM;
break;
}
diff --git a/mm/util.c b/mm/util.c
index 4fb14ca5a419..47a57e557614 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -396,6 +396,13 @@ int __page_mapcount(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_mapcount);
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
+int sysctl_overcommit_ratio __read_mostly = 50;
+unsigned long sysctl_overcommit_kbytes __read_mostly;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
+
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -437,6 +444,123 @@ unsigned long vm_commit_limit(void)
return allowed;
}
+/*
+ * Make sure vm_committed_as in one cacheline and not cacheline shared with
+ * other variables. It can be updated by several CPUs frequently.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as);
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+{
+ long free, allowed, reserve;
+
+ VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
+ -(s64)vm_committed_as_batch * num_online_cpus(),
+ "memory commitment underflow");
+
+ vm_acct_memory(pages);
+
+ /*
+ * Sometimes we want to use more memory than we have
+ */
+ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ return 0;
+
+ if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
+
+ free += get_nr_swap_pages();
+
+ /*
+ * Any slabs which are created with the
+ * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+ * which are reclaimable, under pressure. The dentry
+ * cache and most inode caches should fall into this
+ */
+ free += global_page_state(NR_SLAB_RECLAIMABLE);
+
+ /*
+ * Leave reserved pages. The pages are not for anonymous pages.
+ */
+ if (free <= totalreserve_pages)
+ goto error;
+ else
+ free -= totalreserve_pages;
+
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ if (free > pages)
+ return 0;
+
+ goto error;
+ }
+
+ allowed = vm_commit_limit();
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
+ }
+
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
+ return 0;
+error:
+ vm_unacct_memory(pages);
+
+ return -ENOMEM;
+}
+
/**
* get_cmdline() - copy the cmdline value to a buffer.
* @task: the task whose cmdline value to copy.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb42a5bffe47..ae7d20b447ff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -469,8 +469,8 @@ overflow:
goto retry;
}
if (printk_ratelimit())
- pr_warn("vmap allocation for size %lu failed: "
- "use vmalloc=<size> to increase size.\n", size);
+ pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
+ size);
kfree(va);
return ERR_PTR(-EBUSY);
}
@@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va)
static void vmap_debug_free_range(unsigned long start, unsigned long end)
{
/*
- * Unmap page tables and force a TLB flush immediately if
- * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
- * bugs similarly to those in linear kernel virtual address
- * space after a page has been freed.
+ * Unmap page tables and force a TLB flush immediately if pagealloc
+ * debugging is enabled. This catches use after free bugs similarly to
+ * those in linear kernel virtual address space after a page has been
+ * freed.
*
- * All the lazy freeing logic is still retained, in order to
- * minimise intrusiveness of this debugging feature.
+ * All the lazy freeing logic is still retained, in order to minimise
+ * intrusiveness of this debugging feature.
*
- * This is going to be *slow* (linear kernel virtual address
- * debugging doesn't do a broadcast TLB flush so it is a lot
- * faster).
+ * This is going to be *slow* (linear kernel virtual address debugging
+ * doesn't do a broadcast TLB flush so it is a lot faster).
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
-#endif
+ if (debug_pagealloc_enabled()) {
+ vunmap_page_range(start, end);
+ flush_tlb_kernel_range(start, end);
+ }
}
/*
@@ -1086,7 +1085,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
- BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+ BUG_ON(!PAGE_ALIGNED(addr));
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dd984470248f..b934223eaa45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -382,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
*
* @memcg specifies the memory cgroup to target. If it is not NULL,
* only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise all shrinkers
- * are called, and memcg aware shrinkers are supposed to scan the
- * global list then.
+ * objects from the memory cgroup specified. Otherwise, only unaware
+ * shrinkers are called.
*
* @nr_scanned and @nr_eligible form a ratio that indicate how much of
* the available objects should be scanned. Page reclaim for example
@@ -404,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
- if (memcg && !memcg_kmem_online(memcg))
+ if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
return 0;
if (nr_scanned == 0)
@@ -428,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+ /*
+ * If kernel memory accounting is disabled, we ignore
+ * SHRINKER_MEMCG_AWARE flag and call all shrinkers
+ * passing NULL for memcg.
+ */
+ if (memcg_kmem_enabled() &&
+ !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
continue;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
@@ -633,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
- if (!page_freeze_refs(page, 2))
+ if (!page_ref_freeze(page, 2))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
- page_unfreeze_refs(page, 2);
+ page_ref_unfreeze(page, 2);
goto cannot_free;
}
@@ -699,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
- page_unfreeze_refs(page, 1);
+ page_ref_unfreeze(page, 1);
return 1;
}
return 0;
@@ -2968,18 +2973,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order,
- unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+ unsigned long balance_gap, int classzone_idx)
{
- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx))
- return false;
+ unsigned long mark = high_wmark_pages(zone) + balance_gap;
- if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
- order, 0, classzone_idx) == COMPACT_SKIPPED)
- return false;
+ /*
+ * When checking from pgdat_balanced(), kswapd should stop and sleep
+ * when it reaches the high order-0 watermark and let kcompactd take
+ * over. Other callers such as wakeup_kswapd() want to determine the
+ * true high-order watermark.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+ mark += (1UL << order);
+ order = 0;
+ }
- return true;
+ return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
}
/*
@@ -3029,7 +3039,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
continue;
}
- if (zone_balanced(zone, order, 0, i))
+ if (zone_balanced(zone, order, false, 0, i))
balanced_pages += zone->managed_pages;
else if (!order)
return false;
@@ -3083,10 +3093,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
*/
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
- struct scan_control *sc,
- unsigned long *nr_attempted)
+ struct scan_control *sc)
{
- int testorder = sc->order;
unsigned long balance_gap;
bool lowmem_pressure;
@@ -3094,17 +3102,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
/*
- * Kswapd reclaims only single pages with compaction enabled. Trying
- * too hard to reclaim until contiguous free pages have become
- * available can hurt performance by evicting too much useful data
- * from memory. Do not reclaim more than needed for compaction.
- */
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
- compaction_suitable(zone, sc->order, 0, classzone_idx)
- != COMPACT_SKIPPED)
- testorder = 0;
-
- /*
* We put equal pressure on every zone, unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
@@ -3118,15 +3115,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, testorder,
+ if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
balance_gap, classzone_idx))
return true;
shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
- /* Account for the number of pages attempted to reclaim */
- *nr_attempted += sc->nr_to_reclaim;
-
clear_bit(ZONE_WRITEBACK, &zone->flags);
/*
@@ -3136,7 +3130,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* waits.
*/
if (zone_reclaimable(zone) &&
- zone_balanced(zone, testorder, 0, classzone_idx)) {
+ zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
@@ -3148,7 +3142,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -3165,8 +3159,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -3183,9 +3176,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
count_vm_event(PAGEOUTRUN);
do {
- unsigned long nr_attempted = 0;
bool raise_priority = true;
- bool pgdat_needs_compaction = (order > 0);
sc.nr_reclaimed = 0;
@@ -3220,7 +3211,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
break;
}
- if (!zone_balanced(zone, order, 0, 0)) {
+ if (!zone_balanced(zone, order, false, 0, 0)) {
end_zone = i;
break;
} else {
@@ -3236,24 +3227,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (i < 0)
goto out;
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- /*
- * If any zone is currently balanced then kswapd will
- * not call compaction as it is expected that the
- * necessary pages are already available.
- */
- if (pgdat_needs_compaction &&
- zone_watermark_ok(zone, order,
- low_wmark_pages(zone),
- *classzone_idx, 0))
- pgdat_needs_compaction = false;
- }
-
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
@@ -3297,8 +3270,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* that that high watermark would be met at 100%
* efficiency.
*/
- if (kswapd_shrink_zone(zone, end_zone,
- &sc, &nr_attempted))
+ if (kswapd_shrink_zone(zone, end_zone, &sc))
raise_priority = false;
}
@@ -3311,49 +3283,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
- /*
- * Fragmentation may mean that the system cannot be rebalanced
- * for high-order allocations in all zones. If twice the
- * allocation size has been reclaimed and the zones are still
- * not balanced then recheck the watermarks at order-0 to
- * prevent kswapd reclaiming excessively. Assume that a
- * process requested a high-order can direct reclaim/compact.
- */
- if (order && sc.nr_reclaimed >= 2UL << order)
- order = sc.order = 0;
-
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
- * Compact if necessary and kswapd is reclaiming at least the
- * high watermark number of pages as requsted
- */
- if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
- compact_pgdat(pgdat, order);
-
- /*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, *classzone_idx));
+ !pgdat_balanced(pgdat, order, classzone_idx));
out:
/*
- * Return the order we were reclaiming at so prepare_kswapd_sleep()
- * makes a decision on the order we were last reclaiming at. However,
- * if another caller entered the allocator slow path while kswapd
- * was awake, order will remain at the higher level
+ * Return the highest zone idx we were reclaiming at so
+ * prepare_kswapd_sleep() makes the same decisions as here.
*/
- *classzone_idx = end_zone;
- return order;
+ return end_zone;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+ int classzone_idx, int balanced_classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3364,7 +3316,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3374,7 +3327,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3395,6 +3349,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
reset_isolation_suitable(pgdat);
+ /*
+ * We have freed the memory, now we should compact it to make
+ * allocation of the requested order possible.
+ */
+ wakeup_kcompactd(pgdat, order, classzone_idx);
+
if (!kthread_should_stop())
schedule();
@@ -3424,7 +3384,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
- unsigned balanced_order;
int classzone_idx, new_classzone_idx;
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
@@ -3457,24 +3416,19 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
- balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
bool ret;
/*
- * If the last balance_pgdat was unsuccessful it's unlikely a
- * new request of a similar or harder type will succeed soon
- * so consider going to sleep on the basis we reclaimed at
+ * While we were reclaiming, there might have been another
+ * wakeup, so check the values.
*/
- if (balanced_classzone_idx >= new_classzone_idx &&
- balanced_order == new_order) {
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = pgdat->nr_zones - 1;
- }
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
@@ -3484,7 +3438,7 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, balanced_order,
+ kswapd_try_to_sleep(pgdat, order, classzone_idx,
balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
@@ -3504,9 +3458,8 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balanced_classzone_idx = classzone_idx;
- balanced_order = balance_pgdat(pgdat, order,
- &balanced_classzone_idx);
+ balanced_classzone_idx = balance_pgdat(pgdat, order,
+ classzone_idx);
}
}
@@ -3536,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, 0, 0))
+ if (zone_balanced(zone, order, true, 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 69ce64f7b8d7..5e4300482897 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -826,6 +826,7 @@ const char * const vmstat_text[] = {
"compact_stall",
"compact_fail",
"compact_success",
+ "compact_daemon_wake",
#endif
#ifdef CONFIG_HUGETLB_PAGE
@@ -847,6 +848,7 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc_failed",
"thp_split_page",
"thp_split_page_failed",
+ "thp_deferred_split_page",
"thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
diff --git a/mm/workingset.c b/mm/workingset.c
index 6130ba0b2641..8a75f8d2916a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -349,7 +349,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
- pages = node_present_pages(sc->nid);
+ if (memcg_kmem_enabled())
+ pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL_FILE);
+ else
+ pages = node_page_state(sc->nid, NR_ACTIVE_FILE) +
+ node_page_state(sc->nid, NR_INACTIVE_FILE);
+
/*
* Active cache pages are limited to 50% of memory, and shadow
* entries that represent a refault distance bigger than that
@@ -458,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
/*
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2d7c4c11fc63..e72efb109fde 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -281,7 +281,6 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
- bool huge;
};
static int create_handle_cache(struct zs_pool *pool)
@@ -495,6 +494,8 @@ static void __exit zs_stat_exit(void)
debugfs_remove_recursive(zs_stat_root);
}
+static unsigned long zs_can_compact(struct size_class *class);
+
static int zs_stats_size_show(struct seq_file *s, void *v)
{
int i;
@@ -502,14 +503,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
struct size_class *class;
int objs_per_zspage;
unsigned long class_almost_full, class_almost_empty;
- unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long obj_allocated, obj_used, pages_used, freeable;
unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+ unsigned long total_freeable = 0;
- seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
"class", "size", "almost_full", "almost_empty",
"obj_allocated", "obj_used", "pages_used",
- "pages_per_zspage");
+ "pages_per_zspage", "freeable");
for (i = 0; i < zs_size_classes; i++) {
class = pool->size_class[i];
@@ -522,6 +524,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
obj_used = zs_stat_get(class, OBJ_USED);
+ freeable = zs_can_compact(class);
spin_unlock(&class->lock);
objs_per_zspage = get_maxobj_per_zspage(class->size,
@@ -529,23 +532,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
- seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu"
+ " %10lu %10lu %16d %8lu\n",
i, class->size, class_almost_full, class_almost_empty,
obj_allocated, obj_used, pages_used,
- class->pages_per_zspage);
+ class->pages_per_zspage, freeable);
total_class_almost_full += class_almost_full;
total_class_almost_empty += class_almost_empty;
total_objs += obj_allocated;
total_used_objs += obj_used;
total_pages += pages_used;
+ total_freeable += freeable;
}
seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
"Total", "", total_class_almost_full,
total_class_almost_empty, total_objs,
- total_used_objs, total_pages);
+ total_used_objs, total_pages, "", total_freeable);
return 0;
}
@@ -1127,11 +1132,9 @@ static void __zs_unmap_object(struct mapping_area *area,
goto out;
buf = area->vm_buf;
- if (!area->huge) {
- buf = buf + ZS_HANDLE_SIZE;
- size -= ZS_HANDLE_SIZE;
- off += ZS_HANDLE_SIZE;
- }
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c1c8bc93412..67e7efe12ff7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1903,7 +1903,7 @@ EXPORT_SYMBOL(sock_cmsg_send);
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
if (pfrag->page) {
- if (atomic_read(&pfrag->page->_count) == 1) {
+ if (page_ref_count(pfrag->page) == 1) {
pfrag->offset = 0;
return true;
}
diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c
index 27e25bb78c97..72e2d0012084 100644
--- a/sound/drivers/pcsp/pcsp.c
+++ b/sound/drivers/pcsp/pcsp.c
@@ -14,6 +14,7 @@
#include <linux/input.h>
#include <linux/delay.h>
#include <linux/bitops.h>
+#include <linux/mm.h>
#include "pcsp_input.h"
#include "pcsp.h"
@@ -148,11 +149,11 @@ static int alsa_card_pcsp_init(struct device *dev)
return err;
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
/* Well, CONFIG_DEBUG_PAGEALLOC makes the sound horrible. Lets alert */
- printk(KERN_WARNING "PCSP: CONFIG_DEBUG_PAGEALLOC is enabled, "
- "which may make the sound noisy.\n");
-#endif
+ if (debug_pagealloc_enabled()) {
+ printk(KERN_WARNING "PCSP: CONFIG_DEBUG_PAGEALLOC is enabled, "
+ "which may make the sound noisy.\n");
+ }
return 0;
}
diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore
new file mode 100644
index 000000000000..11d888ca6a92
--- /dev/null
+++ b/tools/testing/radix-tree/.gitignore
@@ -0,0 +1,2 @@
+main
+radix-tree.c
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
new file mode 100644
index 000000000000..604212db9d4b
--- /dev/null
+++ b/tools/testing/radix-tree/Makefile
@@ -0,0 +1,19 @@
+
+CFLAGS += -I. -g -Wall -D_LGPL_SOURCE
+LDFLAGS += -lpthread -lurcu
+TARGETS = main
+OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \
+ regression1.o regression2.o regression3.o
+
+targets: $(TARGETS)
+
+main: $(OFILES)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(OFILES) -o main
+
+clean:
+ $(RM) -f $(TARGETS) *.o radix-tree.c
+
+$(OFILES): *.h */*.h
+
+radix-tree.c: ../../../lib/radix-tree.c
+ sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
diff --git a/tools/testing/radix-tree/find_next_bit.c b/tools/testing/radix-tree/find_next_bit.c
new file mode 100644
index 000000000000..d1c2178bb2d4
--- /dev/null
+++ b/tools/testing/radix-tree/find_next_bit.c
@@ -0,0 +1,57 @@
+/* find_next_bit.c: fallback find next bit implementation
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/*
+ * Find the next set bit in a memory region.
+ */
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BITOP_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size)
+ return size;
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (tmp)
+ goto found_middle;
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if ((tmp = *(p++)))
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) /* Are any bits set? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __ffs(tmp);
+}
diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c
new file mode 100644
index 000000000000..154823737b20
--- /dev/null
+++ b/tools/testing/radix-tree/linux.c
@@ -0,0 +1,60 @@
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <assert.h>
+
+#include <linux/mempool.h>
+#include <linux/slab.h>
+#include <urcu/uatomic.h>
+
+int nr_allocated;
+
+void *mempool_alloc(mempool_t *pool, int gfp_mask)
+{
+ return pool->alloc(gfp_mask, pool->data);
+}
+
+void mempool_free(void *element, mempool_t *pool)
+{
+ pool->free(element, pool->data);
+}
+
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ mempool_t *ret = malloc(sizeof(*ret));
+
+ ret->alloc = alloc_fn;
+ ret->free = free_fn;
+ ret->data = pool_data;
+ return ret;
+}
+
+void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
+{
+ void *ret = malloc(cachep->size);
+ if (cachep->ctor)
+ cachep->ctor(ret);
+ uatomic_inc(&nr_allocated);
+ return ret;
+}
+
+void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+{
+ assert(objp);
+ uatomic_dec(&nr_allocated);
+ memset(objp, 0, cachep->size);
+ free(objp);
+}
+
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t offset,
+ unsigned long flags, void (*ctor)(void *))
+{
+ struct kmem_cache *ret = malloc(sizeof(*ret));
+
+ ret->size = size;
+ ret->ctor = ctor;
+ return ret;
+}
diff --git a/tools/testing/radix-tree/linux/bitops.h b/tools/testing/radix-tree/linux/bitops.h
new file mode 100644
index 000000000000..71d58427ab60
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops.h
@@ -0,0 +1,150 @@
+#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+
+#include <linux/types.h>
+
+#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p |= mask;
+}
+
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p &= ~mask;
+}
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p ^= mask;
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old | mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old & ~mask;
+ return (old & mask) != 0;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr,
+ volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old ^ mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static inline int test_bit(int nr, const volatile unsigned long *addr)
+{
+ return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ int num = 0;
+
+ if ((word & 0xffffffff) == 0) {
+ num += 32;
+ word >>= 32;
+ }
+ if ((word & 0xffff) == 0) {
+ num += 16;
+ word >>= 16;
+ }
+ if ((word & 0xff) == 0) {
+ num += 8;
+ word >>= 8;
+ }
+ if ((word & 0xf) == 0) {
+ num += 4;
+ word >>= 4;
+ }
+ if ((word & 0x3) == 0) {
+ num += 2;
+ word >>= 2;
+ }
+ if ((word & 0x1) == 0)
+ num += 1;
+ return num;
+}
+
+unsigned long find_next_bit(const unsigned long *addr,
+ unsigned long size,
+ unsigned long offset);
+
+#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/__ffs.h b/tools/testing/radix-tree/linux/bitops/__ffs.h
new file mode 100644
index 000000000000..9a3274aecf83
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/__ffs.h
@@ -0,0 +1,43 @@
+#ifndef _ASM_GENERIC_BITOPS___FFS_H_
+#define _ASM_GENERIC_BITOPS___FFS_H_
+
+#include <asm/types.h>
+
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ int num = 0;
+
+#if BITS_PER_LONG == 64
+ if ((word & 0xffffffff) == 0) {
+ num += 32;
+ word >>= 32;
+ }
+#endif
+ if ((word & 0xffff) == 0) {
+ num += 16;
+ word >>= 16;
+ }
+ if ((word & 0xff) == 0) {
+ num += 8;
+ word >>= 8;
+ }
+ if ((word & 0xf) == 0) {
+ num += 4;
+ word >>= 4;
+ }
+ if ((word & 0x3) == 0) {
+ num += 2;
+ word >>= 2;
+ }
+ if ((word & 0x1) == 0)
+ num += 1;
+ return num;
+}
+
+#endif /* _ASM_GENERIC_BITOPS___FFS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/ffs.h b/tools/testing/radix-tree/linux/bitops/ffs.h
new file mode 100644
index 000000000000..fbbb43af7dc0
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/ffs.h
@@ -0,0 +1,41 @@
+#ifndef _ASM_GENERIC_BITOPS_FFS_H_
+#define _ASM_GENERIC_BITOPS_FFS_H_
+
+/**
+ * ffs - find first bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as
+ * the libc and compiler builtin ffs routines, therefore
+ * differs in spirit from the above ffz (man ffs).
+ */
+static inline int ffs(int x)
+{
+ int r = 1;
+
+ if (!x)
+ return 0;
+ if (!(x & 0xffff)) {
+ x >>= 16;
+ r += 16;
+ }
+ if (!(x & 0xff)) {
+ x >>= 8;
+ r += 8;
+ }
+ if (!(x & 0xf)) {
+ x >>= 4;
+ r += 4;
+ }
+ if (!(x & 3)) {
+ x >>= 2;
+ r += 2;
+ }
+ if (!(x & 1)) {
+ x >>= 1;
+ r += 1;
+ }
+ return r;
+}
+
+#endif /* _ASM_GENERIC_BITOPS_FFS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/ffz.h b/tools/testing/radix-tree/linux/bitops/ffz.h
new file mode 100644
index 000000000000..6744bd4cdf46
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/ffz.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_GENERIC_BITOPS_FFZ_H_
+#define _ASM_GENERIC_BITOPS_FFZ_H_
+
+/*
+ * ffz - find first zero in word.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+#define ffz(x) __ffs(~(x))
+
+#endif /* _ASM_GENERIC_BITOPS_FFZ_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/find.h b/tools/testing/radix-tree/linux/bitops/find.h
new file mode 100644
index 000000000000..72a51e5a12ef
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/find.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_GENERIC_BITOPS_FIND_H_
+#define _ASM_GENERIC_BITOPS_FIND_H_
+
+extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
+ size, unsigned long offset);
+
+extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
+
+#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
+#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
+
+#endif /*_ASM_GENERIC_BITOPS_FIND_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/fls.h b/tools/testing/radix-tree/linux/bitops/fls.h
new file mode 100644
index 000000000000..850859bc5069
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/fls.h
@@ -0,0 +1,41 @@
+#ifndef _ASM_GENERIC_BITOPS_FLS_H_
+#define _ASM_GENERIC_BITOPS_FLS_H_
+
+/**
+ * fls - find last (most-significant) bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as ffs.
+ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
+ */
+
+static inline int fls(int x)
+{
+ int r = 32;
+
+ if (!x)
+ return 0;
+ if (!(x & 0xffff0000u)) {
+ x <<= 16;
+ r -= 16;
+ }
+ if (!(x & 0xff000000u)) {
+ x <<= 8;
+ r -= 8;
+ }
+ if (!(x & 0xf0000000u)) {
+ x <<= 4;
+ r -= 4;
+ }
+ if (!(x & 0xc0000000u)) {
+ x <<= 2;
+ r -= 2;
+ }
+ if (!(x & 0x80000000u)) {
+ x <<= 1;
+ r -= 1;
+ }
+ return r;
+}
+
+#endif /* _ASM_GENERIC_BITOPS_FLS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/fls64.h b/tools/testing/radix-tree/linux/bitops/fls64.h
new file mode 100644
index 000000000000..1b6b17ce2428
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/fls64.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_GENERIC_BITOPS_FLS64_H_
+#define _ASM_GENERIC_BITOPS_FLS64_H_
+
+#include <asm/types.h>
+
+static inline int fls64(__u64 x)
+{
+ __u32 h = x >> 32;
+ if (h)
+ return fls(h) + 32;
+ return fls(x);
+}
+
+#endif /* _ASM_GENERIC_BITOPS_FLS64_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/hweight.h b/tools/testing/radix-tree/linux/bitops/hweight.h
new file mode 100644
index 000000000000..fbbc383771da
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/hweight.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_GENERIC_BITOPS_HWEIGHT_H_
+#define _ASM_GENERIC_BITOPS_HWEIGHT_H_
+
+#include <asm/types.h>
+
+extern unsigned int hweight32(unsigned int w);
+extern unsigned int hweight16(unsigned int w);
+extern unsigned int hweight8(unsigned int w);
+extern unsigned long hweight64(__u64 w);
+
+#endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/le.h b/tools/testing/radix-tree/linux/bitops/le.h
new file mode 100644
index 000000000000..b9c7e5d2d2ad
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/le.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_GENERIC_BITOPS_LE_H_
+#define _ASM_GENERIC_BITOPS_LE_H_
+
+#include <asm/types.h>
+#include <asm/byteorder.h>
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+#define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7)
+
+#if defined(__LITTLE_ENDIAN)
+
+#define generic_test_le_bit(nr, addr) test_bit(nr, addr)
+#define generic___set_le_bit(nr, addr) __set_bit(nr, addr)
+#define generic___clear_le_bit(nr, addr) __clear_bit(nr, addr)
+
+#define generic_test_and_set_le_bit(nr, addr) test_and_set_bit(nr, addr)
+#define generic_test_and_clear_le_bit(nr, addr) test_and_clear_bit(nr, addr)
+
+#define generic___test_and_set_le_bit(nr, addr) __test_and_set_bit(nr, addr)
+#define generic___test_and_clear_le_bit(nr, addr) __test_and_clear_bit(nr, addr)
+
+#define generic_find_next_zero_le_bit(addr, size, offset) find_next_zero_bit(addr, size, offset)
+
+#elif defined(__BIG_ENDIAN)
+
+#define generic_test_le_bit(nr, addr) \
+ test_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___set_le_bit(nr, addr) \
+ __set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___clear_le_bit(nr, addr) \
+ __clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+#define generic_test_and_set_le_bit(nr, addr) \
+ test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic_test_and_clear_le_bit(nr, addr) \
+ test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+#define generic___test_and_set_le_bit(nr, addr) \
+ __test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___test_and_clear_le_bit(nr, addr) \
+ __test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+extern unsigned long generic_find_next_zero_le_bit(const unsigned long *addr,
+ unsigned long size, unsigned long offset);
+
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+
+#define generic_find_first_zero_le_bit(addr, size) \
+ generic_find_next_zero_le_bit((addr), (size), 0)
+
+#endif /* _ASM_GENERIC_BITOPS_LE_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/non-atomic.h b/tools/testing/radix-tree/linux/bitops/non-atomic.h
new file mode 100644
index 000000000000..46a825cf2ae1
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bitops/non-atomic.h
@@ -0,0 +1,111 @@
+#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+
+#include <asm/types.h>
+
+#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p |= mask;
+}
+
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p &= ~mask;
+}
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ *p ^= mask;
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old | mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old & ~mask;
+ return (old & mask) != 0;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr,
+ volatile unsigned long *addr)
+{
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old ^ mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static inline int test_bit(int nr, const volatile unsigned long *addr)
+{
+ return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h
new file mode 100644
index 000000000000..ccbe444977df
--- /dev/null
+++ b/tools/testing/radix-tree/linux/bug.h
@@ -0,0 +1 @@
+#define WARN_ON_ONCE(x) assert(x)
diff --git a/tools/testing/radix-tree/linux/cpu.h b/tools/testing/radix-tree/linux/cpu.h
new file mode 100644
index 000000000000..60a40459f269
--- /dev/null
+++ b/tools/testing/radix-tree/linux/cpu.h
@@ -0,0 +1,34 @@
+
+#define hotcpu_notifier(a, b)
+
+#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */
+#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */
+#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */
+#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */
+#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
+#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
+#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task,
+ * not handling interrupts, soon dead.
+ * Called on the dying cpu, interrupts
+ * are already disabled. Must not
+ * sleep, must not fail */
+#define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug
+ * lock is dropped */
+#define CPU_STARTING 0x000A /* CPU (unsigned)v soon running.
+ * Called on the new cpu, just before
+ * enabling interrupts. Must not sleep,
+ * must not fail */
+#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached
+ * idle loop. */
+#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly,
+ * perhaps due to preemption. */
+#define CPU_TASKS_FROZEN 0x0010
+
+#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)
+#define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN)
+#define CPU_UP_CANCELED_FROZEN (CPU_UP_CANCELED | CPU_TASKS_FROZEN)
+#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
+#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
+#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
+#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN)
+#define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN)
diff --git a/tools/testing/radix-tree/linux/export.h b/tools/testing/radix-tree/linux/export.h
new file mode 100644
index 000000000000..b6afd131998d
--- /dev/null
+++ b/tools/testing/radix-tree/linux/export.h
@@ -0,0 +1,2 @@
+
+#define EXPORT_SYMBOL(sym)
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h
new file mode 100644
index 000000000000..0e37f7a760eb
--- /dev/null
+++ b/tools/testing/radix-tree/linux/gfp.h
@@ -0,0 +1,10 @@
+#ifndef _GFP_H
+#define _GFP_H
+
+#define __GFP_BITS_SHIFT 22
+#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
+#define __GFP_WAIT 1
+#define __GFP_ACCOUNT 0
+#define __GFP_NOWARN 0
+
+#endif
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
new file mode 100644
index 000000000000..ae013b0160ac
--- /dev/null
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -0,0 +1,35 @@
+#ifndef _KERNEL_H
+#define _KERNEL_H
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <limits.h>
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define BUG_ON(expr) assert(!(expr))
+#define __init
+#define __must_check
+#define panic(expr)
+#define printk printf
+#define __force
+#define likely(c) (c)
+#define unlikely(c) (c)
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type, member) );})
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+static inline int in_interrupt(void)
+{
+ return 0;
+}
+#endif /* _KERNEL_H */
diff --git a/tools/testing/radix-tree/linux/kmemleak.h b/tools/testing/radix-tree/linux/kmemleak.h
new file mode 100644
index 000000000000..155f112786c4
--- /dev/null
+++ b/tools/testing/radix-tree/linux/kmemleak.h
@@ -0,0 +1 @@
+static inline void kmemleak_update_trace(const void *ptr) { }
diff --git a/tools/testing/radix-tree/linux/mempool.h b/tools/testing/radix-tree/linux/mempool.h
new file mode 100644
index 000000000000..6a2dc55b41d6
--- /dev/null
+++ b/tools/testing/radix-tree/linux/mempool.h
@@ -0,0 +1,16 @@
+
+#include <linux/slab.h>
+
+typedef void *(mempool_alloc_t)(int gfp_mask, void *pool_data);
+typedef void (mempool_free_t)(void *element, void *pool_data);
+
+typedef struct {
+ mempool_alloc_t *alloc;
+ mempool_free_t *free;
+ void *data;
+} mempool_t;
+
+void *mempool_alloc(mempool_t *pool, int gfp_mask);
+void mempool_free(void *element, mempool_t *pool);
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data);
diff --git a/tools/testing/radix-tree/linux/notifier.h b/tools/testing/radix-tree/linux/notifier.h
new file mode 100644
index 000000000000..70e4797d5a46
--- /dev/null
+++ b/tools/testing/radix-tree/linux/notifier.h
@@ -0,0 +1,8 @@
+#ifndef _NOTIFIER_H
+#define _NOTIFIER_H
+
+struct notifier_block;
+
+#define NOTIFY_OK 0x0001 /* Suits me */
+
+#endif
diff --git a/tools/testing/radix-tree/linux/percpu.h b/tools/testing/radix-tree/linux/percpu.h
new file mode 100644
index 000000000000..5837f1d56f17
--- /dev/null
+++ b/tools/testing/radix-tree/linux/percpu.h
@@ -0,0 +1,7 @@
+
+#define DEFINE_PER_CPU(type, val) type val
+
+#define __get_cpu_var(var) var
+#define this_cpu_ptr(var) var
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu))
diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h
new file mode 100644
index 000000000000..6210672e3baa
--- /dev/null
+++ b/tools/testing/radix-tree/linux/preempt.h
@@ -0,0 +1,4 @@
+/* */
+
+#define preempt_disable() do { } while (0)
+#define preempt_enable() do { } while (0)
diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/radix-tree/linux/radix-tree.h
new file mode 100644
index 000000000000..ce694ddd4aea
--- /dev/null
+++ b/tools/testing/radix-tree/linux/radix-tree.h
@@ -0,0 +1 @@
+#include "../../../../include/linux/radix-tree.h"
diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h
new file mode 100644
index 000000000000..f7129ea2a899
--- /dev/null
+++ b/tools/testing/radix-tree/linux/rcupdate.h
@@ -0,0 +1,9 @@
+#ifndef _RCUPDATE_H
+#define _RCUPDATE_H
+
+#include <urcu.h>
+
+#define rcu_dereference_raw(p) rcu_dereference(p)
+#define rcu_dereference_protected(p, cond) rcu_dereference(p)
+
+#endif
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h
new file mode 100644
index 000000000000..57282506c21d
--- /dev/null
+++ b/tools/testing/radix-tree/linux/slab.h
@@ -0,0 +1,28 @@
+#ifndef SLAB_H
+#define SLAB_H
+
+#include <linux/types.h>
+
+#define GFP_KERNEL 1
+#define SLAB_HWCACHE_ALIGN 1
+#define SLAB_PANIC 2
+#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
+
+static inline int gfpflags_allow_blocking(gfp_t mask)
+{
+ return 1;
+}
+
+struct kmem_cache {
+ int size;
+ void (*ctor)(void *);
+};
+
+void *kmem_cache_alloc(struct kmem_cache *cachep, int flags);
+void kmem_cache_free(struct kmem_cache *cachep, void *objp);
+
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t offset,
+ unsigned long flags, void (*ctor)(void *));
+
+#endif /* SLAB_H */
diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h
new file mode 100644
index 000000000000..72a9d85f6c76
--- /dev/null
+++ b/tools/testing/radix-tree/linux/types.h
@@ -0,0 +1,28 @@
+#ifndef _TYPES_H
+#define _TYPES_H
+
+#define __rcu
+#define __read_mostly
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+ list->next = list;
+ list->prev = list;
+}
+
+typedef struct {
+ unsigned int x;
+} spinlock_t;
+
+#define uninitialized_var(x) x = x
+
+typedef unsigned gfp_t;
+#include <linux/gfp.h>
+
+#endif
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
new file mode 100644
index 000000000000..0e83cad27a9f
--- /dev/null
+++ b/tools/testing/radix-tree/main.c
@@ -0,0 +1,272 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+
+#include "test.h"
+#include "regression.h"
+
+void __gang_check(unsigned long middle, long down, long up, int chunk, int hop)
+{
+ long idx;
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ middle = 1 << 30;
+
+ for (idx = -down; idx < up; idx++)
+ item_insert(&tree, middle + idx);
+
+ item_check_absent(&tree, middle - down - 1);
+ for (idx = -down; idx < up; idx++)
+ item_check_present(&tree, middle + idx);
+ item_check_absent(&tree, middle + up);
+
+ item_gang_check_present(&tree, middle - down,
+ up + down, chunk, hop);
+ item_full_scan(&tree, middle - down, down + up, chunk);
+ item_kill_tree(&tree);
+}
+
+void gang_check(void)
+{
+ __gang_check(1 << 30, 128, 128, 35, 2);
+ __gang_check(1 << 31, 128, 128, 32, 32);
+ __gang_check(1 << 31, 128, 128, 32, 100);
+ __gang_check(1 << 31, 128, 128, 17, 7);
+ __gang_check(0xffff0000, 0, 65536, 17, 7);
+ __gang_check(0xfffffffe, 1, 1, 17, 7);
+}
+
+void __big_gang_check(void)
+{
+ unsigned long start;
+ int wrapped = 0;
+
+ start = 0;
+ do {
+ unsigned long old_start;
+
+// printf("0x%08lx\n", start);
+ __gang_check(start, rand() % 113 + 1, rand() % 71,
+ rand() % 157, rand() % 91 + 1);
+ old_start = start;
+ start += rand() % 1000000;
+ start %= 1ULL << 33;
+ if (start < old_start)
+ wrapped = 1;
+ } while (!wrapped);
+}
+
+void big_gang_check(void)
+{
+ int i;
+
+ for (i = 0; i < 1000; i++) {
+ __big_gang_check();
+ srand(time(0));
+ printf("%d ", i);
+ fflush(stdout);
+ }
+}
+
+void add_and_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ item_insert(&tree, 44);
+ item_check_present(&tree, 44);
+ item_check_absent(&tree, 43);
+ item_kill_tree(&tree);
+}
+
+void dynamic_height_check(void)
+{
+ int i;
+ RADIX_TREE(tree, GFP_KERNEL);
+ tree_verify_min_height(&tree, 0);
+
+ item_insert(&tree, 42);
+ tree_verify_min_height(&tree, 42);
+
+ item_insert(&tree, 1000000);
+ tree_verify_min_height(&tree, 1000000);
+
+ assert(item_delete(&tree, 1000000));
+ tree_verify_min_height(&tree, 42);
+
+ assert(item_delete(&tree, 42));
+ tree_verify_min_height(&tree, 0);
+
+ for (i = 0; i < 1000; i++) {
+ item_insert(&tree, i);
+ tree_verify_min_height(&tree, i);
+ }
+
+ i--;
+ for (;;) {
+ assert(item_delete(&tree, i));
+ if (i == 0) {
+ tree_verify_min_height(&tree, 0);
+ break;
+ }
+ i--;
+ tree_verify_min_height(&tree, i);
+ }
+
+ item_kill_tree(&tree);
+}
+
+void check_copied_tags(struct radix_tree_root *tree, unsigned long start, unsigned long end, unsigned long *idx, int count, int fromtag, int totag)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+/* if (i % 1000 == 0)
+ putchar('.'); */
+ if (idx[i] < start || idx[i] > end) {
+ if (item_tag_get(tree, idx[i], totag)) {
+ printf("%lu-%lu: %lu, tags %d-%d\n", start, end, idx[i], item_tag_get(tree, idx[i], fromtag), item_tag_get(tree, idx[i], totag));
+ }
+ assert(!item_tag_get(tree, idx[i], totag));
+ continue;
+ }
+ if (item_tag_get(tree, idx[i], fromtag) ^
+ item_tag_get(tree, idx[i], totag)) {
+ printf("%lu-%lu: %lu, tags %d-%d\n", start, end, idx[i], item_tag_get(tree, idx[i], fromtag), item_tag_get(tree, idx[i], totag));
+ }
+ assert(!(item_tag_get(tree, idx[i], fromtag) ^
+ item_tag_get(tree, idx[i], totag)));
+ }
+}
+
+#define ITEMS 50000
+
+void copy_tag_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ unsigned long idx[ITEMS];
+ unsigned long start, end, count = 0, tagged, cur, tmp;
+ int i;
+
+// printf("generating radix tree indices...\n");
+ start = rand();
+ end = rand();
+ if (start > end && (rand() % 10)) {
+ cur = start;
+ start = end;
+ end = cur;
+ }
+ /* Specifically create items around the start and the end of the range
+ * with high probability to check for off by one errors */
+ cur = rand();
+ if (cur & 1) {
+ item_insert(&tree, start);
+ if (cur & 2) {
+ if (start <= end)
+ count++;
+ item_tag_set(&tree, start, 0);
+ }
+ }
+ if (cur & 4) {
+ item_insert(&tree, start-1);
+ if (cur & 8)
+ item_tag_set(&tree, start-1, 0);
+ }
+ if (cur & 16) {
+ item_insert(&tree, end);
+ if (cur & 32) {
+ if (start <= end)
+ count++;
+ item_tag_set(&tree, end, 0);
+ }
+ }
+ if (cur & 64) {
+ item_insert(&tree, end+1);
+ if (cur & 128)
+ item_tag_set(&tree, end+1, 0);
+ }
+
+ for (i = 0; i < ITEMS; i++) {
+ do {
+ idx[i] = rand();
+ } while (item_lookup(&tree, idx[i]));
+
+ item_insert(&tree, idx[i]);
+ if (rand() & 1) {
+ item_tag_set(&tree, idx[i], 0);
+ if (idx[i] >= start && idx[i] <= end)
+ count++;
+ }
+/* if (i % 1000 == 0)
+ putchar('.'); */
+ }
+
+// printf("\ncopying tags...\n");
+ cur = start;
+ tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1);
+
+// printf("checking copied tags\n");
+ assert(tagged == count);
+ check_copied_tags(&tree, start, end, idx, ITEMS, 0, 1);
+
+ /* Copy tags in several rounds */
+// printf("\ncopying tags...\n");
+ cur = start;
+ do {
+ tmp = rand() % (count/10+2);
+ tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2);
+ } while (tmp == tagged);
+
+// printf("%lu %lu %lu\n", tagged, tmp, count);
+// printf("checking copied tags\n");
+ check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2);
+ assert(tagged < tmp);
+ verify_tag_consistency(&tree, 0);
+ verify_tag_consistency(&tree, 1);
+ verify_tag_consistency(&tree, 2);
+// printf("\n");
+ item_kill_tree(&tree);
+}
+
+static void single_thread_tests(void)
+{
+ int i;
+
+ tag_check();
+ printf("after tag_check: %d allocated\n", nr_allocated);
+ gang_check();
+ printf("after gang_check: %d allocated\n", nr_allocated);
+ add_and_check();
+ printf("after add_and_check: %d allocated\n", nr_allocated);
+ dynamic_height_check();
+ printf("after dynamic_height_check: %d allocated\n", nr_allocated);
+ big_gang_check();
+ printf("after big_gang_check: %d allocated\n", nr_allocated);
+ for (i = 0; i < 2000; i++) {
+ copy_tag_check();
+ printf("%d ", i);
+ fflush(stdout);
+ }
+ printf("after copy_tag_check: %d allocated\n", nr_allocated);
+}
+
+int main(void)
+{
+ rcu_register_thread();
+ radix_tree_init();
+
+ regression1_test();
+ regression2_test();
+ regression3_test();
+ single_thread_tests();
+
+ sleep(1);
+ printf("after sleep(1): %d allocated\n", nr_allocated);
+ rcu_unregister_thread();
+
+ exit(0);
+}
diff --git a/tools/testing/radix-tree/rcupdate.c b/tools/testing/radix-tree/rcupdate.c
new file mode 100644
index 000000000000..31a2d14225d6
--- /dev/null
+++ b/tools/testing/radix-tree/rcupdate.c
@@ -0,0 +1,86 @@
+#include <linux/rcupdate.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <assert.h>
+
+static pthread_mutex_t rculock = PTHREAD_MUTEX_INITIALIZER;
+static struct rcu_head *rcuhead_global = NULL;
+static __thread int nr_rcuhead = 0;
+static __thread struct rcu_head *rcuhead = NULL;
+static __thread struct rcu_head *rcutail = NULL;
+
+static pthread_cond_t rcu_worker_cond = PTHREAD_COND_INITIALIZER;
+
+/* switch to urcu implementation when it is merged. */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head))
+{
+ head->func = func;
+ head->next = rcuhead;
+ rcuhead = head;
+ if (!rcutail)
+ rcutail = head;
+ nr_rcuhead++;
+ if (nr_rcuhead >= 1000) {
+ int signal = 0;
+
+ pthread_mutex_lock(&rculock);
+ if (!rcuhead_global)
+ signal = 1;
+ rcutail->next = rcuhead_global;
+ rcuhead_global = head;
+ pthread_mutex_unlock(&rculock);
+
+ nr_rcuhead = 0;
+ rcuhead = NULL;
+ rcutail = NULL;
+
+ if (signal) {
+ pthread_cond_signal(&rcu_worker_cond);
+ }
+ }
+}
+
+static void *rcu_worker(void *arg)
+{
+ struct rcu_head *r;
+
+ rcupdate_thread_init();
+
+ while (1) {
+ pthread_mutex_lock(&rculock);
+ while (!rcuhead_global) {
+ pthread_cond_wait(&rcu_worker_cond, &rculock);
+ }
+ r = rcuhead_global;
+ rcuhead_global = NULL;
+
+ pthread_mutex_unlock(&rculock);
+
+ synchronize_rcu();
+
+ while (r) {
+ struct rcu_head *tmp = r->next;
+ r->func(r);
+ r = tmp;
+ }
+ }
+
+ rcupdate_thread_exit();
+
+ return NULL;
+}
+
+static pthread_t worker_thread;
+void rcupdate_init(void)
+{
+ pthread_create(&worker_thread, NULL, rcu_worker, NULL);
+}
+
+void rcupdate_thread_init(void)
+{
+ rcu_register_thread();
+}
+void rcupdate_thread_exit(void)
+{
+ rcu_unregister_thread();
+}
diff --git a/tools/testing/radix-tree/regression.h b/tools/testing/radix-tree/regression.h
new file mode 100644
index 000000000000..e018c4816688
--- /dev/null
+++ b/tools/testing/radix-tree/regression.h
@@ -0,0 +1,8 @@
+#ifndef __REGRESSION_H__
+#define __REGRESSION_H__
+
+void regression1_test(void);
+void regression2_test(void);
+void regression3_test(void);
+
+#endif
diff --git a/tools/testing/radix-tree/regression1.c b/tools/testing/radix-tree/regression1.c
new file mode 100644
index 000000000000..2d03a63bb79c
--- /dev/null
+++ b/tools/testing/radix-tree/regression1.c
@@ -0,0 +1,220 @@
+/*
+ * Regression1
+ * Description:
+ * Salman Qazi describes the following radix-tree bug:
+ *
+ * In the following case, we get can get a deadlock:
+ *
+ * 0. The radix tree contains two items, one has the index 0.
+ * 1. The reader (in this case find_get_pages) takes the rcu_read_lock.
+ * 2. The reader acquires slot(s) for item(s) including the index 0 item.
+ * 3. The non-zero index item is deleted, and as a consequence the other item
+ * is moved to the root of the tree. The place where it used to be is queued
+ * for deletion after the readers finish.
+ * 3b. The zero item is deleted, removing it from the direct slot, it remains in
+ * the rcu-delayed indirect node.
+ * 4. The reader looks at the index 0 slot, and finds that the page has 0 ref
+ * count
+ * 5. The reader looks at it again, hoping that the item will either be freed
+ * or the ref count will increase. This never happens, as the slot it is
+ * looking at will never be updated. Also, this slot can never be reclaimed
+ * because the reader is holding rcu_read_lock and is in an infinite loop.
+ *
+ * The fix is to re-use the same "indirect" pointer case that requires a slot
+ * lookup retry into a general "retry the lookup" bit.
+ *
+ * Running:
+ * This test should run to completion in a few seconds. The above bug would
+ * cause it to hang indefinitely.
+ *
+ * Upstream commit:
+ * Not yet
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "regression.h"
+
+static RADIX_TREE(mt_tree, GFP_KERNEL);
+static pthread_mutex_t mt_lock;
+
+struct page {
+ pthread_mutex_t lock;
+ struct rcu_head rcu;
+ int count;
+ unsigned long index;
+};
+
+static struct page *page_alloc(void)
+{
+ struct page *p;
+ p = malloc(sizeof(struct page));
+ p->count = 1;
+ p->index = 1;
+ pthread_mutex_init(&p->lock, NULL);
+
+ return p;
+}
+
+static void page_rcu_free(struct rcu_head *rcu)
+{
+ struct page *p = container_of(rcu, struct page, rcu);
+ assert(!p->count);
+ pthread_mutex_destroy(&p->lock);
+ free(p);
+}
+
+static void page_free(struct page *p)
+{
+ call_rcu(&p->rcu, page_rcu_free);
+}
+
+static unsigned find_get_pages(unsigned long start,
+ unsigned int nr_pages, struct page **pages)
+{
+ unsigned int i;
+ unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ nr_found = radix_tree_gang_lookup_slot(&mt_tree,
+ (void ***)pages, NULL, start, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ assert((start | i) == 0);
+ goto restart;
+ }
+ /*
+ * No exceptional entries are inserted in this test.
+ */
+ assert(0);
+ }
+
+ pthread_mutex_lock(&page->lock);
+ if (!page->count) {
+ pthread_mutex_unlock(&page->lock);
+ goto repeat;
+ }
+ /* don't actually update page refcount */
+ pthread_mutex_unlock(&page->lock);
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ ret++;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static pthread_barrier_t worker_barrier;
+
+static void *regression1_fn(void *arg)
+{
+ rcu_register_thread();
+
+ if (pthread_barrier_wait(&worker_barrier) ==
+ PTHREAD_BARRIER_SERIAL_THREAD) {
+ int j;
+
+ for (j = 0; j < 1000000; j++) {
+ struct page *p;
+
+ p = page_alloc();
+ pthread_mutex_lock(&mt_lock);
+ radix_tree_insert(&mt_tree, 0, p);
+ pthread_mutex_unlock(&mt_lock);
+
+ p = page_alloc();
+ pthread_mutex_lock(&mt_lock);
+ radix_tree_insert(&mt_tree, 1, p);
+ pthread_mutex_unlock(&mt_lock);
+
+ pthread_mutex_lock(&mt_lock);
+ p = radix_tree_delete(&mt_tree, 1);
+ pthread_mutex_lock(&p->lock);
+ p->count--;
+ pthread_mutex_unlock(&p->lock);
+ pthread_mutex_unlock(&mt_lock);
+ page_free(p);
+
+ pthread_mutex_lock(&mt_lock);
+ p = radix_tree_delete(&mt_tree, 0);
+ pthread_mutex_lock(&p->lock);
+ p->count--;
+ pthread_mutex_unlock(&p->lock);
+ pthread_mutex_unlock(&mt_lock);
+ page_free(p);
+ }
+ } else {
+ int j;
+
+ for (j = 0; j < 100000000; j++) {
+ struct page *pages[10];
+
+ find_get_pages(0, 10, pages);
+ }
+ }
+
+ rcu_unregister_thread();
+
+ return NULL;
+}
+
+static pthread_t *threads;
+void regression1_test(void)
+{
+ int nr_threads;
+ int i;
+ long arg;
+
+ /* Regression #1 */
+ printf("running regression test 1, should finish in under a minute\n");
+ nr_threads = 2;
+ pthread_barrier_init(&worker_barrier, NULL, nr_threads);
+
+ threads = malloc(nr_threads * sizeof(pthread_t *));
+
+ for (i = 0; i < nr_threads; i++) {
+ arg = i;
+ if (pthread_create(&threads[i], NULL, regression1_fn, (void *)arg)) {
+ perror("pthread_create");
+ exit(1);
+ }
+ }
+
+ for (i = 0; i < nr_threads; i++) {
+ if (pthread_join(threads[i], NULL)) {
+ perror("pthread_join");
+ exit(1);
+ }
+ }
+
+ free(threads);
+
+ printf("regression test 1, done\n");
+}
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
new file mode 100644
index 000000000000..5d2fa28cdca3
--- /dev/null
+++ b/tools/testing/radix-tree/regression2.c
@@ -0,0 +1,126 @@
+/*
+ * Regression2
+ * Description:
+ * Toshiyuki Okajima describes the following radix-tree bug:
+ *
+ * In the following case, we can get a hangup on
+ * radix_radix_tree_gang_lookup_tag_slot.
+ *
+ * 0. The radix tree contains RADIX_TREE_MAP_SIZE items. And the tag of
+ * a certain item has PAGECACHE_TAG_DIRTY.
+ * 1. radix_tree_range_tag_if_tagged(, start, end, , PAGECACHE_TAG_DIRTY,
+ * PAGECACHE_TAG_TOWRITE) is called to add PAGECACHE_TAG_TOWRITE tag
+ * for the tag which has PAGECACHE_TAG_DIRTY. However, there is no tag with
+ * PAGECACHE_TAG_DIRTY within the range from start to end. As the result,
+ * There is no tag with PAGECACHE_TAG_TOWRITE but the root tag has
+ * PAGECACHE_TAG_TOWRITE.
+ * 2. An item is added into the radix tree and then the level of it is
+ * extended into 2 from 1. At that time, the new radix tree node succeeds
+ * the tag status of the root tag. Therefore the tag of the new radix tree
+ * node has PAGECACHE_TAG_TOWRITE but there is not slot with
+ * PAGECACHE_TAG_TOWRITE tag in the child node of the new radix tree node.
+ * 3. The tag of a certain item is cleared with PAGECACHE_TAG_DIRTY.
+ * 4. All items within the index range from 0 to RADIX_TREE_MAP_SIZE - 1 are
+ * released. (Only the item which index is RADIX_TREE_MAP_SIZE exist in the
+ * radix tree.) As the result, the slot of the radix tree node is NULL but
+ * the tag which corresponds to the slot has PAGECACHE_TAG_TOWRITE.
+ * 5. radix_tree_gang_lookup_tag_slot(PAGECACHE_TAG_TOWRITE) calls
+ * __lookup_tag. __lookup_tag returns with 0. And __lookup_tag doesn't
+ * change the index that is the input and output parameter. Because the 1st
+ * slot of the radix tree node is NULL, but the tag which corresponds to
+ * the slot has PAGECACHE_TAG_TOWRITE.
+ * Therefore radix_tree_gang_lookup_tag_slot tries to get some items by
+ * calling __lookup_tag, but it cannot get any items forever.
+ *
+ * The fix is to change that radix_tree_tag_if_tagged doesn't tag the root tag
+ * if it doesn't set any tags within the specified range.
+ *
+ * Running:
+ * This test should run to completion immediately. The above bug would cause it
+ * to hang indefinitely.
+ *
+ * Upstream commit:
+ * Not yet
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "regression.h"
+
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
+#else
+#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
+#endif
+
+#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
+#define PAGECACHE_TAG_DIRTY 0
+#define PAGECACHE_TAG_WRITEBACK 1
+#define PAGECACHE_TAG_TOWRITE 2
+
+static RADIX_TREE(mt_tree, GFP_KERNEL);
+unsigned long page_count = 0;
+
+struct page {
+ unsigned long index;
+};
+
+static struct page *page_alloc(void)
+{
+ struct page *p;
+ p = malloc(sizeof(struct page));
+ p->index = page_count++;
+
+ return p;
+}
+
+void regression2_test(void)
+{
+ int i;
+ struct page *p;
+ int max_slots = RADIX_TREE_MAP_SIZE;
+ unsigned long int start, end;
+ struct page *pages[1];
+
+ printf("running regression test 2 (should take milliseconds)\n");
+ /* 0. */
+ for (i = 0; i <= max_slots - 1; i++) {
+ p = page_alloc();
+ radix_tree_insert(&mt_tree, i, p);
+ }
+ radix_tree_tag_set(&mt_tree, max_slots - 1, PAGECACHE_TAG_DIRTY);
+
+ /* 1. */
+ start = 0;
+ end = max_slots - 2;
+ radix_tree_range_tag_if_tagged(&mt_tree, &start, end, 1,
+ PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+
+ /* 2. */
+ p = page_alloc();
+ radix_tree_insert(&mt_tree, max_slots, p);
+
+ /* 3. */
+ radix_tree_tag_clear(&mt_tree, max_slots - 1, PAGECACHE_TAG_DIRTY);
+
+ /* 4. */
+ for (i = max_slots - 1; i >= 0; i--)
+ radix_tree_delete(&mt_tree, i);
+
+ /* 5. */
+ // NOTE: start should not be 0 because radix_tree_gang_lookup_tag_slot
+ // can return.
+ start = 1;
+ end = max_slots - 2;
+ radix_tree_gang_lookup_tag_slot(&mt_tree, (void ***)pages, start, end,
+ PAGECACHE_TAG_TOWRITE);
+
+ /* We remove all the remained nodes */
+ radix_tree_delete(&mt_tree, max_slots);
+
+ printf("regression test 2, done\n");
+}
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
new file mode 100644
index 000000000000..1f06ed73d0a8
--- /dev/null
+++ b/tools/testing/radix-tree/regression3.c
@@ -0,0 +1,117 @@
+/*
+ * Regression3
+ * Description:
+ * Helper radix_tree_iter_retry resets next_index to the current index.
+ * In following radix_tree_next_slot current chunk size becomes zero.
+ * This isn't checked and it tries to dereference null pointer in slot.
+ *
+ * Helper radix_tree_iter_next reset slot to NULL and next_index to index + 1,
+ * for tagger iteraction it also must reset cached tags in iterator to abort
+ * next radix_tree_next_slot and go to slow-path into radix_tree_next_chunk.
+ *
+ * Running:
+ * This test should run to completion immediately. The above bug would
+ * cause it to segfault.
+ *
+ * Upstream commit:
+ * Not yet
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "regression.h"
+
+void regression3_test(void)
+{
+ RADIX_TREE(root, GFP_KERNEL);
+ void *ptr0 = (void *)4ul;
+ void *ptr = (void *)8ul;
+ struct radix_tree_iter iter;
+ void **slot;
+ bool first;
+
+ printf("running regression test 3 (should take milliseconds)\n");
+
+ radix_tree_insert(&root, 0, ptr0);
+ radix_tree_tag_set(&root, 0, 0);
+
+ first = true;
+ radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) {
+ printf("tagged %ld %p\n", iter.index, *slot);
+ if (first) {
+ radix_tree_insert(&root, 1, ptr);
+ radix_tree_tag_set(&root, 1, 0);
+ first = false;
+ }
+ if (radix_tree_deref_retry(*slot)) {
+ printf("retry at %ld\n", iter.index);
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ }
+ radix_tree_delete(&root, 1);
+
+ first = true;
+ radix_tree_for_each_slot(slot, &root, &iter, 0) {
+ printf("slot %ld %p\n", iter.index, *slot);
+ if (first) {
+ radix_tree_insert(&root, 1, ptr);
+ first = false;
+ }
+ if (radix_tree_deref_retry(*slot)) {
+ printk("retry at %ld\n", iter.index);
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ }
+ radix_tree_delete(&root, 1);
+
+ first = true;
+ radix_tree_for_each_contig(slot, &root, &iter, 0) {
+ printk("contig %ld %p\n", iter.index, *slot);
+ if (first) {
+ radix_tree_insert(&root, 1, ptr);
+ first = false;
+ }
+ if (radix_tree_deref_retry(*slot)) {
+ printk("retry at %ld\n", iter.index);
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ }
+
+ radix_tree_for_each_slot(slot, &root, &iter, 0) {
+ printf("slot %ld %p\n", iter.index, *slot);
+ if (!iter.index) {
+ printf("next at %ld\n", iter.index);
+ slot = radix_tree_iter_next(&iter);
+ }
+ }
+
+ radix_tree_for_each_contig(slot, &root, &iter, 0) {
+ printf("contig %ld %p\n", iter.index, *slot);
+ if (!iter.index) {
+ printf("next at %ld\n", iter.index);
+ slot = radix_tree_iter_next(&iter);
+ }
+ }
+
+ radix_tree_tag_set(&root, 0, 0);
+ radix_tree_tag_set(&root, 1, 0);
+ radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) {
+ printf("tagged %ld %p\n", iter.index, *slot);
+ if (!iter.index) {
+ printf("next at %ld\n", iter.index);
+ slot = radix_tree_iter_next(&iter);
+ }
+ }
+
+ radix_tree_delete(&root, 0);
+ radix_tree_delete(&root, 1);
+
+ printf("regression test 3 passed\n");
+}
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c
new file mode 100644
index 000000000000..83136be552a0
--- /dev/null
+++ b/tools/testing/radix-tree/tag_check.c
@@ -0,0 +1,332 @@
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+
+#include "test.h"
+
+
+static void
+__simple_checks(struct radix_tree_root *tree, unsigned long index, int tag)
+{
+ int ret;
+
+ item_check_absent(tree, index);
+ assert(item_tag_get(tree, index, tag) == 0);
+
+ item_insert(tree, index);
+ assert(item_tag_get(tree, index, tag) == 0);
+ item_tag_set(tree, index, tag);
+ ret = item_tag_get(tree, index, tag);
+ assert(ret != 0);
+ ret = item_delete(tree, index);
+ assert(ret != 0);
+ item_insert(tree, index);
+ ret = item_tag_get(tree, index, tag);
+ assert(ret == 0);
+ ret = item_delete(tree, index);
+ assert(ret != 0);
+ ret = item_delete(tree, index);
+ assert(ret == 0);
+}
+
+void simple_checks(void)
+{
+ unsigned long index;
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ for (index = 0; index < 10000; index++) {
+ __simple_checks(&tree, index, 0);
+ __simple_checks(&tree, index, 1);
+ }
+ verify_tag_consistency(&tree, 0);
+ verify_tag_consistency(&tree, 1);
+ printf("before item_kill_tree: %d allocated\n", nr_allocated);
+ item_kill_tree(&tree);
+ printf("after item_kill_tree: %d allocated\n", nr_allocated);
+}
+
+/*
+ * Check that tags propagate correctly when extending a tree.
+ */
+static void extend_checks(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ item_insert(&tree, 43);
+ assert(item_tag_get(&tree, 43, 0) == 0);
+ item_tag_set(&tree, 43, 0);
+ assert(item_tag_get(&tree, 43, 0) == 1);
+ item_insert(&tree, 1000000);
+ assert(item_tag_get(&tree, 43, 0) == 1);
+
+ item_insert(&tree, 0);
+ item_tag_set(&tree, 0, 0);
+ item_delete(&tree, 1000000);
+ assert(item_tag_get(&tree, 43, 0) != 0);
+ item_delete(&tree, 43);
+ assert(item_tag_get(&tree, 43, 0) == 0); /* crash */
+ assert(item_tag_get(&tree, 0, 0) == 1);
+
+ verify_tag_consistency(&tree, 0);
+
+ item_kill_tree(&tree);
+}
+
+/*
+ * Check that tags propagate correctly when contracting a tree.
+ */
+static void contract_checks(void)
+{
+ struct item *item;
+ int tmp;
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ tmp = 1<<RADIX_TREE_MAP_SHIFT;
+ item_insert(&tree, tmp);
+ item_insert(&tree, tmp+1);
+ item_tag_set(&tree, tmp, 0);
+ item_tag_set(&tree, tmp, 1);
+ item_tag_set(&tree, tmp+1, 0);
+ item_delete(&tree, tmp+1);
+ item_tag_clear(&tree, tmp, 1);
+
+ assert(radix_tree_gang_lookup_tag(&tree, (void **)&item, 0, 1, 0) == 1);
+ assert(radix_tree_gang_lookup_tag(&tree, (void **)&item, 0, 1, 1) == 0);
+
+ assert(item_tag_get(&tree, tmp, 0) == 1);
+ assert(item_tag_get(&tree, tmp, 1) == 0);
+
+ verify_tag_consistency(&tree, 0);
+ item_kill_tree(&tree);
+}
+
+/*
+ * Stupid tag thrasher
+ *
+ * Create a large linear array corresponding to the tree. Each element in
+ * the array is coherent with each node in the tree
+ */
+
+enum {
+ NODE_ABSENT = 0,
+ NODE_PRESENT = 1,
+ NODE_TAGGED = 2,
+};
+
+#define THRASH_SIZE 1000 * 1000
+#define N 127
+#define BATCH 33
+
+static void gang_check(struct radix_tree_root *tree,
+ char *thrash_state, int tag)
+{
+ struct item *items[BATCH];
+ int nr_found;
+ unsigned long index = 0;
+ unsigned long last_index = 0;
+
+ while ((nr_found = radix_tree_gang_lookup_tag(tree, (void **)items,
+ index, BATCH, tag))) {
+ int i;
+
+ for (i = 0; i < nr_found; i++) {
+ struct item *item = items[i];
+
+ while (last_index < item->index) {
+ assert(thrash_state[last_index] != NODE_TAGGED);
+ last_index++;
+ }
+ assert(thrash_state[last_index] == NODE_TAGGED);
+ last_index++;
+ }
+ index = items[nr_found - 1]->index + 1;
+ }
+}
+
+static void do_thrash(struct radix_tree_root *tree, char *thrash_state, int tag)
+{
+ int insert_chunk;
+ int delete_chunk;
+ int tag_chunk;
+ int untag_chunk;
+ int total_tagged = 0;
+ int total_present = 0;
+
+ for (insert_chunk = 1; insert_chunk < THRASH_SIZE; insert_chunk *= N)
+ for (delete_chunk = 1; delete_chunk < THRASH_SIZE; delete_chunk *= N)
+ for (tag_chunk = 1; tag_chunk < THRASH_SIZE; tag_chunk *= N)
+ for (untag_chunk = 1; untag_chunk < THRASH_SIZE; untag_chunk *= N) {
+ int i;
+ unsigned long index;
+ int nr_inserted = 0;
+ int nr_deleted = 0;
+ int nr_tagged = 0;
+ int nr_untagged = 0;
+ int actual_total_tagged;
+ int actual_total_present;
+
+ for (i = 0; i < insert_chunk; i++) {
+ index = rand() % THRASH_SIZE;
+ if (thrash_state[index] != NODE_ABSENT)
+ continue;
+ item_check_absent(tree, index);
+ item_insert(tree, index);
+ assert(thrash_state[index] != NODE_PRESENT);
+ thrash_state[index] = NODE_PRESENT;
+ nr_inserted++;
+ total_present++;
+ }
+
+ for (i = 0; i < delete_chunk; i++) {
+ index = rand() % THRASH_SIZE;
+ if (thrash_state[index] == NODE_ABSENT)
+ continue;
+ item_check_present(tree, index);
+ if (item_tag_get(tree, index, tag)) {
+ assert(thrash_state[index] == NODE_TAGGED);
+ total_tagged--;
+ } else {
+ assert(thrash_state[index] == NODE_PRESENT);
+ }
+ item_delete(tree, index);
+ assert(thrash_state[index] != NODE_ABSENT);
+ thrash_state[index] = NODE_ABSENT;
+ nr_deleted++;
+ total_present--;
+ }
+
+ for (i = 0; i < tag_chunk; i++) {
+ index = rand() % THRASH_SIZE;
+ if (thrash_state[index] != NODE_PRESENT) {
+ if (item_lookup(tree, index))
+ assert(item_tag_get(tree, index, tag));
+ continue;
+ }
+ item_tag_set(tree, index, tag);
+ item_tag_set(tree, index, tag);
+ assert(thrash_state[index] != NODE_TAGGED);
+ thrash_state[index] = NODE_TAGGED;
+ nr_tagged++;
+ total_tagged++;
+ }
+
+ for (i = 0; i < untag_chunk; i++) {
+ index = rand() % THRASH_SIZE;
+ if (thrash_state[index] != NODE_TAGGED)
+ continue;
+ item_check_present(tree, index);
+ assert(item_tag_get(tree, index, tag));
+ item_tag_clear(tree, index, tag);
+ item_tag_clear(tree, index, tag);
+ assert(thrash_state[index] != NODE_PRESENT);
+ thrash_state[index] = NODE_PRESENT;
+ nr_untagged++;
+ total_tagged--;
+ }
+
+ actual_total_tagged = 0;
+ actual_total_present = 0;
+ for (index = 0; index < THRASH_SIZE; index++) {
+ switch (thrash_state[index]) {
+ case NODE_ABSENT:
+ item_check_absent(tree, index);
+ break;
+ case NODE_PRESENT:
+ item_check_present(tree, index);
+ assert(!item_tag_get(tree, index, tag));
+ actual_total_present++;
+ break;
+ case NODE_TAGGED:
+ item_check_present(tree, index);
+ assert(item_tag_get(tree, index, tag));
+ actual_total_present++;
+ actual_total_tagged++;
+ break;
+ }
+ }
+
+ gang_check(tree, thrash_state, tag);
+
+ printf("%d(%d) %d(%d) %d(%d) %d(%d) / "
+ "%d(%d) present, %d(%d) tagged\n",
+ insert_chunk, nr_inserted,
+ delete_chunk, nr_deleted,
+ tag_chunk, nr_tagged,
+ untag_chunk, nr_untagged,
+ total_present, actual_total_present,
+ total_tagged, actual_total_tagged);
+ }
+}
+
+static void thrash_tags(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ char *thrash_state;
+
+ thrash_state = malloc(THRASH_SIZE);
+ memset(thrash_state, 0, THRASH_SIZE);
+
+ do_thrash(&tree, thrash_state, 0);
+
+ verify_tag_consistency(&tree, 0);
+ item_kill_tree(&tree);
+ free(thrash_state);
+}
+
+static void leak_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ item_insert(&tree, 1000000);
+ item_delete(&tree, 1000000);
+ item_kill_tree(&tree);
+}
+
+static void __leak_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+ item_insert(&tree, 1000000);
+ printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+ item_delete(&tree, 1000000);
+ printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+ item_kill_tree(&tree);
+ printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+}
+
+static void single_check(void)
+{
+ struct item *items[BATCH];
+ RADIX_TREE(tree, GFP_KERNEL);
+ int ret;
+
+ item_insert(&tree, 0);
+ item_tag_set(&tree, 0, 0);
+ ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 0);
+ assert(ret == 1);
+ ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 1, BATCH, 0);
+ assert(ret == 0);
+ verify_tag_consistency(&tree, 0);
+ verify_tag_consistency(&tree, 1);
+ item_kill_tree(&tree);
+}
+
+void tag_check(void)
+{
+ single_check();
+ extend_checks();
+ contract_checks();
+ printf("after extend_checks: %d allocated\n", nr_allocated);
+ __leak_check();
+ leak_check();
+ printf("after leak_check: %d allocated\n", nr_allocated);
+ simple_checks();
+ printf("after simple_checks: %d allocated\n", nr_allocated);
+ thrash_tags();
+ printf("after thrash_tags: %d allocated\n", nr_allocated);
+}
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
new file mode 100644
index 000000000000..2bebf34cdc27
--- /dev/null
+++ b/tools/testing/radix-tree/test.c
@@ -0,0 +1,219 @@
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+
+#include "test.h"
+
+struct item *
+item_tag_set(struct radix_tree_root *root, unsigned long index, int tag)
+{
+ return radix_tree_tag_set(root, index, tag);
+}
+
+struct item *
+item_tag_clear(struct radix_tree_root *root, unsigned long index, int tag)
+{
+ return radix_tree_tag_clear(root, index, tag);
+}
+
+int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag)
+{
+ return radix_tree_tag_get(root, index, tag);
+}
+
+int __item_insert(struct radix_tree_root *root, struct item *item)
+{
+ return radix_tree_insert(root, item->index, item);
+}
+
+int item_insert(struct radix_tree_root *root, unsigned long index)
+{
+ return __item_insert(root, item_create(index));
+}
+
+int item_delete(struct radix_tree_root *root, unsigned long index)
+{
+ struct item *item = radix_tree_delete(root, index);
+
+ if (item) {
+ assert(item->index == index);
+ free(item);
+ return 1;
+ }
+ return 0;
+}
+
+struct item *item_create(unsigned long index)
+{
+ struct item *ret = malloc(sizeof(*ret));
+
+ ret->index = index;
+ return ret;
+}
+
+void item_check_present(struct radix_tree_root *root, unsigned long index)
+{
+ struct item *item;
+
+ item = radix_tree_lookup(root, index);
+ assert(item != 0);
+ assert(item->index == index);
+}
+
+struct item *item_lookup(struct radix_tree_root *root, unsigned long index)
+{
+ return radix_tree_lookup(root, index);
+}
+
+void item_check_absent(struct radix_tree_root *root, unsigned long index)
+{
+ struct item *item;
+
+ item = radix_tree_lookup(root, index);
+ assert(item == 0);
+}
+
+/*
+ * Scan only the passed (start, start+nr] for present items
+ */
+void item_gang_check_present(struct radix_tree_root *root,
+ unsigned long start, unsigned long nr,
+ int chunk, int hop)
+{
+ struct item *items[chunk];
+ unsigned long into;
+
+ for (into = 0; into < nr; ) {
+ int nfound;
+ int nr_to_find = chunk;
+ int i;
+
+ if (nr_to_find > (nr - into))
+ nr_to_find = nr - into;
+
+ nfound = radix_tree_gang_lookup(root, (void **)items,
+ start + into, nr_to_find);
+ assert(nfound == nr_to_find);
+ for (i = 0; i < nfound; i++)
+ assert(items[i]->index == start + into + i);
+ into += hop;
+ }
+}
+
+/*
+ * Scan the entire tree, only expecting present items (start, start+nr]
+ */
+void item_full_scan(struct radix_tree_root *root, unsigned long start,
+ unsigned long nr, int chunk)
+{
+ struct item *items[chunk];
+ unsigned long into = 0;
+ unsigned long this_index = start;
+ int nfound;
+ int i;
+
+// printf("%s(0x%08lx, 0x%08lx, %d)\n", __FUNCTION__, start, nr, chunk);
+
+ while ((nfound = radix_tree_gang_lookup(root, (void **)items, into,
+ chunk))) {
+// printf("At 0x%08lx, nfound=%d\n", into, nfound);
+ for (i = 0; i < nfound; i++) {
+ assert(items[i]->index == this_index);
+ this_index++;
+ }
+// printf("Found 0x%08lx->0x%08lx\n",
+// items[0]->index, items[nfound-1]->index);
+ into = this_index;
+ }
+ if (chunk)
+ assert(this_index == start + nr);
+ nfound = radix_tree_gang_lookup(root, (void **)items,
+ this_index, chunk);
+ assert(nfound == 0);
+}
+
+static int verify_node(struct radix_tree_node *slot, unsigned int tag,
+ unsigned int height, int tagged)
+{
+ int anyset = 0;
+ int i;
+ int j;
+
+ slot = indirect_to_ptr(slot);
+
+ /* Verify consistency at this level */
+ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++) {
+ if (slot->tags[tag][i]) {
+ anyset = 1;
+ break;
+ }
+ }
+ if (tagged != anyset) {
+ printf("tag: %u, height %u, tagged: %d, anyset: %d\n", tag, height, tagged, anyset);
+ for (j = 0; j < RADIX_TREE_MAX_TAGS; j++) {
+ printf("tag %d: ", j);
+ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++)
+ printf("%016lx ", slot->tags[j][i]);
+ printf("\n");
+ }
+ return 1;
+ }
+ assert(tagged == anyset);
+
+ /* Go for next level */
+ if (height > 1) {
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
+ if (slot->slots[i])
+ if (verify_node(slot->slots[i], tag, height - 1,
+ !!test_bit(i, slot->tags[tag]))) {
+ printf("Failure at off %d\n", i);
+ for (j = 0; j < RADIX_TREE_MAX_TAGS; j++) {
+ printf("tag %d: ", j);
+ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++)
+ printf("%016lx ", slot->tags[j][i]);
+ printf("\n");
+ }
+ return 1;
+ }
+ }
+ return 0;
+}
+
+void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag)
+{
+ if (!root->height)
+ return;
+ verify_node(root->rnode, tag, root->height, !!root_tag_get(root, tag));
+}
+
+void item_kill_tree(struct radix_tree_root *root)
+{
+ struct item *items[32];
+ int nfound;
+
+ while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) {
+ int i;
+
+ for (i = 0; i < nfound; i++) {
+ void *ret;
+
+ ret = radix_tree_delete(root, items[i]->index);
+ assert(ret == items[i]);
+ free(items[i]);
+ }
+ }
+ assert(radix_tree_gang_lookup(root, (void **)items, 0, 32) == 0);
+ assert(root->rnode == NULL);
+}
+
+void tree_verify_min_height(struct radix_tree_root *root, int maxindex)
+{
+ assert(radix_tree_maxindex(root->height) >= maxindex);
+ if (root->height > 1)
+ assert(radix_tree_maxindex(root->height-1) < maxindex);
+ else if (root->height == 1)
+ assert(radix_tree_maxindex(root->height-1) <= maxindex);
+}
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
new file mode 100644
index 000000000000..4e1d95faaa94
--- /dev/null
+++ b/tools/testing/radix-tree/test.h
@@ -0,0 +1,40 @@
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
+
+struct item {
+ unsigned long index;
+};
+
+struct item *item_create(unsigned long index);
+int __item_insert(struct radix_tree_root *root, struct item *item);
+int item_insert(struct radix_tree_root *root, unsigned long index);
+int item_delete(struct radix_tree_root *root, unsigned long index);
+struct item *item_lookup(struct radix_tree_root *root, unsigned long index);
+
+void item_check_present(struct radix_tree_root *root, unsigned long index);
+void item_check_absent(struct radix_tree_root *root, unsigned long index);
+void item_gang_check_present(struct radix_tree_root *root,
+ unsigned long start, unsigned long nr,
+ int chunk, int hop);
+void item_full_scan(struct radix_tree_root *root, unsigned long start,
+ unsigned long nr, int chunk);
+void item_kill_tree(struct radix_tree_root *root);
+
+void tag_check(void);
+
+struct item *
+item_tag_set(struct radix_tree_root *root, unsigned long index, int tag);
+struct item *
+item_tag_clear(struct radix_tree_root *root, unsigned long index, int tag);
+int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag);
+void tree_verify_min_height(struct radix_tree_root *root, int maxindex);
+void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag);
+
+extern int nr_allocated;
+
+/* Normally private parts of lib/radix-tree.c */
+void *indirect_to_ptr(void *ptr);
+int root_tag_get(struct radix_tree_root *root, unsigned int tag);
+unsigned long radix_tree_maxindex(unsigned int height);
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 5a6016224bb9..e92903fc7113 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -61,6 +61,8 @@
#define PM_PFRAME_BITS 55
#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+#define MAX_SWAPFILES_SHIFT 5
+#define PM_SWAP_OFFSET(x) (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)
#define PM_SOFT_DIRTY (1ULL << 55)
#define PM_MMAP_EXCLUSIVE (1ULL << 56)
#define PM_FILE (1ULL << 61)
@@ -73,6 +75,7 @@
#define KPF_BYTES 8
#define PROC_KPAGEFLAGS "/proc/kpageflags"
+#define PROC_KPAGECGROUP "/proc/kpagecgroup"
/* [32-] kernel hacking assistances */
#define KPF_RESERVED 32
@@ -92,7 +95,8 @@
#define KPF_SLOB_FREE 49
#define KPF_SLUB_FROZEN 50
#define KPF_SLUB_DEBUG 51
-#define KPF_FILE 62
+#define KPF_FILE 61
+#define KPF_SWAP 62
#define KPF_MMAP_EXCLUSIVE 63
#define KPF_ALL_BITS ((uint64_t)~0ULL)
@@ -146,6 +150,7 @@ static const char * const page_flag_names[] = {
[KPF_SLUB_DEBUG] = "E:slub_debug",
[KPF_FILE] = "F:file",
+ [KPF_SWAP] = "w:swap",
[KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive",
};
@@ -164,7 +169,9 @@ static int opt_raw; /* for kernel developers */
static int opt_list; /* list pages (in ranges) */
static int opt_no_summary; /* don't show summary */
static pid_t opt_pid; /* process to walk */
-const char * opt_file;
+const char * opt_file; /* file or directory path */
+static uint64_t opt_cgroup; /* cgroup inode */
+static int opt_list_cgroup;/* list page cgroup */
#define MAX_ADDR_RANGES 1024
static int nr_addr_ranges;
@@ -185,6 +192,7 @@ static int page_size;
static int pagemap_fd;
static int kpageflags_fd;
+static int kpagecgroup_fd = -1;
static int opt_hwpoison;
static int opt_unpoison;
@@ -278,6 +286,16 @@ static unsigned long kpageflags_read(uint64_t *buf,
return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages);
}
+static unsigned long kpagecgroup_read(uint64_t *buf,
+ unsigned long index,
+ unsigned long pages)
+{
+ if (kpagecgroup_fd < 0)
+ return pages;
+
+ return do_u64_read(kpagecgroup_fd, PROC_KPAGEFLAGS, buf, index, pages);
+}
+
static unsigned long pagemap_read(uint64_t *buf,
unsigned long index,
unsigned long pages)
@@ -297,6 +315,10 @@ static unsigned long pagemap_pfn(uint64_t val)
return pfn;
}
+static unsigned long pagemap_swap_offset(uint64_t val)
+{
+ return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0;
+}
/*
* page flag names
@@ -346,14 +368,15 @@ static char *page_flag_longname(uint64_t flags)
*/
static void show_page_range(unsigned long voffset, unsigned long offset,
- unsigned long size, uint64_t flags)
+ unsigned long size, uint64_t flags, uint64_t cgroup)
{
static uint64_t flags0;
+ static uint64_t cgroup0;
static unsigned long voff;
static unsigned long index;
static unsigned long count;
- if (flags == flags0 && offset == index + count &&
+ if (flags == flags0 && cgroup == cgroup0 && offset == index + count &&
size && voffset == voff + count) {
count += size;
return;
@@ -364,11 +387,14 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
printf("%lx\t", voff);
if (opt_file)
printf("%lu\t", voff);
+ if (opt_list_cgroup)
+ printf("@%llu\t", (unsigned long long)cgroup0);
printf("%lx\t%lx\t%s\n",
index, count, page_flag_name(flags0));
}
flags0 = flags;
+ cgroup0= cgroup;
index = offset;
voff = voffset;
count = size;
@@ -376,16 +402,18 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
static void flush_page_range(void)
{
- show_page_range(0, 0, 0, 0);
+ show_page_range(0, 0, 0, 0, 0);
}
-static void show_page(unsigned long voffset,
- unsigned long offset, uint64_t flags)
+static void show_page(unsigned long voffset, unsigned long offset,
+ uint64_t flags, uint64_t cgroup)
{
if (opt_pid)
printf("%lx\t", voffset);
if (opt_file)
printf("%lu\t", voffset);
+ if (opt_list_cgroup)
+ printf("@%llu\t", (unsigned long long)cgroup);
printf("%lx\t%s\n", offset, page_flag_name(flags));
}
@@ -452,6 +480,8 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
flags |= BIT(SOFTDIRTY);
if (pme & PM_FILE)
flags |= BIT(FILE);
+ if (pme & PM_SWAP)
+ flags |= BIT(SWAP);
if (pme & PM_MMAP_EXCLUSIVE)
flags |= BIT(MMAP_EXCLUSIVE);
@@ -566,23 +596,26 @@ static size_t hash_slot(uint64_t flags)
exit(EXIT_FAILURE);
}
-static void add_page(unsigned long voffset,
- unsigned long offset, uint64_t flags, uint64_t pme)
+static void add_page(unsigned long voffset, unsigned long offset,
+ uint64_t flags, uint64_t cgroup, uint64_t pme)
{
flags = kpageflags_flags(flags, pme);
if (!bit_mask_ok(flags))
return;
+ if (opt_cgroup && cgroup != (uint64_t)opt_cgroup)
+ return;
+
if (opt_hwpoison)
hwpoison_page(offset);
if (opt_unpoison)
unpoison_page(offset);
if (opt_list == 1)
- show_page_range(voffset, offset, 1, flags);
+ show_page_range(voffset, offset, 1, flags, cgroup);
else if (opt_list == 2)
- show_page(voffset, offset, flags);
+ show_page(voffset, offset, flags, cgroup);
nr_pages[hash_slot(flags)]++;
total_pages++;
@@ -595,24 +628,57 @@ static void walk_pfn(unsigned long voffset,
uint64_t pme)
{
uint64_t buf[KPAGEFLAGS_BATCH];
+ uint64_t cgi[KPAGEFLAGS_BATCH];
unsigned long batch;
unsigned long pages;
unsigned long i;
+ /*
+ * kpagecgroup_read() reads only if kpagecgroup were opened, but
+ * /proc/kpagecgroup might even not exist, so it's better to fill
+ * them with zeros here.
+ */
+ if (count == 1)
+ cgi[0] = 0;
+ else
+ memset(cgi, 0, sizeof cgi);
+
while (count) {
batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
pages = kpageflags_read(buf, index, batch);
if (pages == 0)
break;
+ if (kpagecgroup_read(cgi, index, pages) != pages)
+ fatal("kpagecgroup returned fewer pages than expected");
+
for (i = 0; i < pages; i++)
- add_page(voffset + i, index + i, buf[i], pme);
+ add_page(voffset + i, index + i, buf[i], cgi[i], pme);
index += pages;
count -= pages;
}
}
+static void walk_swap(unsigned long voffset, uint64_t pme)
+{
+ uint64_t flags = kpageflags_flags(0, pme);
+
+ if (!bit_mask_ok(flags))
+ return;
+
+ if (opt_cgroup)
+ return;
+
+ if (opt_list == 1)
+ show_page_range(voffset, pagemap_swap_offset(pme), 1, flags, 0);
+ else if (opt_list == 2)
+ show_page(voffset, pagemap_swap_offset(pme), flags, 0);
+
+ nr_pages[hash_slot(flags)]++;
+ total_pages++;
+}
+
#define PAGEMAP_BATCH (64 << 10)
static void walk_vma(unsigned long index, unsigned long count)
{
@@ -632,6 +698,8 @@ static void walk_vma(unsigned long index, unsigned long count)
pfn = pagemap_pfn(buf[i]);
if (pfn)
walk_pfn(index + i, pfn, 1, buf[i]);
+ if (buf[i] & PM_SWAP)
+ walk_swap(index + i, buf[i]);
}
index += pages;
@@ -713,10 +781,12 @@ static void usage(void)
" -d|--describe flags Describe flags\n"
" -a|--addr addr-spec Walk a range of pages\n"
" -b|--bits bits-spec Walk pages with specified bits\n"
+" -c|--cgroup path|@inode Walk pages within memory cgroup\n"
" -p|--pid pid Walk process address space\n"
" -f|--file filename Walk file address space\n"
" -l|--list Show page details in ranges\n"
" -L|--list-each Show page details one by one\n"
+" -C|--list-cgroup Show cgroup inode for pages\n"
" -N|--no-summary Don't show summary info\n"
" -X|--hwpoison hwpoison pages\n"
" -x|--unpoison unpoison pages\n"
@@ -851,6 +921,7 @@ static void walk_file(const char *name, const struct stat *st)
{
uint8_t vec[PAGEMAP_BATCH];
uint64_t buf[PAGEMAP_BATCH], flags;
+ uint64_t cgroup = 0;
unsigned long nr_pages, pfn, i;
off_t off, end = st->st_size;
int fd;
@@ -908,12 +979,15 @@ got_sigbus:
continue;
if (!kpageflags_read(&flags, pfn, 1))
continue;
+ if (!kpagecgroup_read(&cgroup, pfn, 1))
+ fatal("kpagecgroup_read failed");
if (first && opt_list) {
first = 0;
flush_page_range();
show_file(name, st);
}
- add_page(off / page_size + i, pfn, flags, buf[i]);
+ add_page(off / page_size + i, pfn,
+ flags, cgroup, buf[i]);
}
}
@@ -965,6 +1039,24 @@ static void parse_file(const char *name)
opt_file = name;
}
+static void parse_cgroup(const char *path)
+{
+ if (path[0] == '@') {
+ opt_cgroup = parse_number(path + 1);
+ return;
+ }
+
+ struct stat st;
+
+ if (stat(path, &st))
+ fatal("stat failed: %s: %m\n", path);
+
+ if (!S_ISDIR(st.st_mode))
+ fatal("cgroup supposed to be a directory: %s\n", path);
+
+ opt_cgroup = st.st_ino;
+}
+
static void parse_addr_range(const char *optarg)
{
unsigned long offset;
@@ -1088,9 +1180,11 @@ static const struct option opts[] = {
{ "file" , 1, NULL, 'f' },
{ "addr" , 1, NULL, 'a' },
{ "bits" , 1, NULL, 'b' },
+ { "cgroup" , 1, NULL, 'c' },
{ "describe" , 1, NULL, 'd' },
{ "list" , 0, NULL, 'l' },
{ "list-each" , 0, NULL, 'L' },
+ { "list-cgroup", 0, NULL, 'C' },
{ "no-summary", 0, NULL, 'N' },
{ "hwpoison" , 0, NULL, 'X' },
{ "unpoison" , 0, NULL, 'x' },
@@ -1105,7 +1199,7 @@ int main(int argc, char *argv[])
page_size = getpagesize();
while ((c = getopt_long(argc, argv,
- "rp:f:a:b:d:lLNXxh", opts, NULL)) != -1) {
+ "rp:f:a:b:d:c:ClLNXxh", opts, NULL)) != -1) {
switch (c) {
case 'r':
opt_raw = 1;
@@ -1122,6 +1216,12 @@ int main(int argc, char *argv[])
case 'b':
parse_bits_mask(optarg);
break;
+ case 'c':
+ parse_cgroup(optarg);
+ break;
+ case 'C':
+ opt_list_cgroup = 1;
+ break;
case 'd':
describe_flags(optarg);
exit(0);
@@ -1151,10 +1251,15 @@ int main(int argc, char *argv[])
}
}
+ if (opt_cgroup || opt_list_cgroup)
+ kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
+
if (opt_list && opt_pid)
printf("voffset\t");
if (opt_list && opt_file)
printf("foffset\t");
+ if (opt_list && opt_list_cgroup)
+ printf("cgroup\t");
if (opt_list == 1)
printf("offset\tlen\tflags\n");
if (opt_list == 2)