From d1836a3b2a9a50134ac449359b2fec56e3685e1d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 18 Jul 2023 13:28:11 +0800 Subject: mm/damon/core-test: initialise context before test in damon_test_set_attrs() Running kunit test for 6.5-rc1 hits one bug: ok 10 damon_test_update_monitoring_result general protection fault, probably for non-canonical address 0x1bffa5c419cfb81: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 110 Comm: kunit_try_catch Tainted: G N 6.5.0-rc2 #15 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:damon_set_attrs+0xb9/0x120 Code: f8 00 00 00 4c 8d 58 e0 48 39 c3 74 ba 41 ba 59 17 b7 d1 49 8b 43 10 4d 8d 4b 10 48 8d 70 e0 49 39 c1 74 50 49 8b 40 08 31 d2 <69> 4e 18 10 27 00 00 49 f7 30 31 d2 48 89 c5 89 c8 f7 f5 31 d2 89 RSP: 0000:ffffc900005bfd40 EFLAGS: 00010246 RAX: ffffffff81159fc0 RBX: ffffc900005bfeb8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 01bffa5c419cfb69 RDI: ffffc900005bfd70 RBP: ffffc90000013c10 R08: ffffc900005bfdc0 R09: ffffffff81ff10ed R10: 00000000d1b71759 R11: ffffffff81ff10dd R12: ffffc90000013a78 R13: ffff88810eb78180 R14: ffffffff818297c0 R15: ffffc90000013c28 FS: 0000000000000000(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002a1c001 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: damon_test_set_attrs+0x63/0x1f0 kunit_generic_run_threadfn_adapter+0x17/0x30 kthread+0xfd/0x130 The problem seems to be related with the damon_ctx was used without being initialized. Fix it by adding the initialization. Link: https://lkml.kernel.org/r/20230718052811.1065173-1-feng.tang@intel.com Fixes: aa13779be6b7 ("mm/damon/core-test: add a test for damon_set_attrs()") Signed-off-by: Feng Tang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index c11210124344..bb07721909e1 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -320,25 +320,25 @@ static void damon_test_update_monitoring_result(struct kunit *test) static void damon_test_set_attrs(struct kunit *test) { - struct damon_ctx ctx; + struct damon_ctx *c = damon_new_ctx(); struct damon_attrs valid_attrs = { .min_nr_regions = 10, .max_nr_regions = 1000, .sample_interval = 5000, .aggr_interval = 100000,}; struct damon_attrs invalid_attrs; - KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &valid_attrs), 0); + KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &valid_attrs), 0); invalid_attrs = valid_attrs; invalid_attrs.min_nr_regions = 1; - KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); invalid_attrs = valid_attrs; invalid_attrs.max_nr_regions = 9; - KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); invalid_attrs = valid_attrs; invalid_attrs.aggr_interval = 4999; - KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); } static struct kunit_case damon_test_cases[] = { -- cgit v1.2.3 From dddfa05eb58076ad60f9a66e7155a5b3502b2dd5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Jul 2023 17:31:31 +0300 Subject: Revert "um: Use swap() to make 
code cleaner" This reverts commit 9b0da3f22307af693be80f5d3a89dc4c7f360a85. The sigio.c is clearly user space code which is handled by arch/um/scripts/Makefile.rules (see USER_OBJS rule). The above mentioned commit simply broke this agreement, we may not use Linux kernel internal headers in them without thorough thinking. Hence, revert the wrong commit. Link: https://lkml.kernel.org/r/20230724143131.30090-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202307212304.cH79zJp1-lkp@intel.com/ Cc: Anton Ivanov Cc: Herve Codina Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Rasmus Villemoes Cc: Richard Weinberger Cc: Yang Guang Cc: Signed-off-by: Andrew Morton --- arch/um/os-Linux/sigio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 37d60e72cf26..9e71794839e8 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -3,7 +3,6 @@ * Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ -#include #include #include #include @@ -51,7 +50,7 @@ static struct pollfds all_sigio_fds; static int write_sigio_thread(void *unused) { - struct pollfds *fds; + struct pollfds *fds, tmp; struct pollfd *p; int i, n, respond_fd; char c; @@ -78,7 +77,9 @@ static int write_sigio_thread(void *unused) "write_sigio_thread : " "read on socket failed, " "err = %d\n", errno); - swap(current_poll, next_poll); + tmp = current_poll; + current_poll = next_poll; + next_poll = tmp; respond_fd = sigio_private[1]; } else { -- cgit v1.2.3 From 253e5df8b8f0145adb090f57c6f4e6efa52d738e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 23 Jul 2023 13:55:00 -0700 Subject: tmpfs: fix Documentation of noswap and huge mount options The noswap mount option is surely not one of the three options for sizing: move its description down. The huge= mount option does not accept numeric values: those are just in an internal enum. Delete those numbers, and follow the manpage text more closely (but there's not yet any fadvise() or fcntl() which applies here). /sys/kernel/mm/transparent_hugepage/shmem_enabled is hard to describe, and barely relevant to mounting a tmpfs: just refer to transhuge.rst (while still using the words deny and force, to help as informal reminders). [rdunlap@infradead.org: fixup Docs table for huge mount options] Link: https://lkml.kernel.org/r/20230725052333.26857-1-rdunlap@infradead.org Link: https://lkml.kernel.org/r/986cb0bf-9780-354-9bb-4bf57aadbab@google.com Signed-off-by: Hugh Dickins Signed-off-by: Randy Dunlap Fixes: d0f5a85442d1 ("shmem: update documentation") Fixes: 2c6efe9cf2d7 ("shmem: add support to ignore swap") Reviewed-by: Luis Chamberlain Cc: Christian Brauner Signed-off-by: Andrew Morton --- Documentation/filesystems/tmpfs.rst | 47 ++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst index f18f46be5c0c..2cd8fa332feb 100644 --- a/Documentation/filesystems/tmpfs.rst +++ b/Documentation/filesystems/tmpfs.rst @@ -84,8 +84,6 @@ nr_inodes The maximum number of inodes for this instance. The default is half of the number of your physical RAM pages, or (on a machine with highmem) the number of lowmem RAM pages, whichever is the lower. -noswap Disables swap. Remounts must respect the original settings. - By default swap is enabled. 
========= ============================================================ These parameters accept a suffix k, m or g for kilo, mega and giga and @@ -99,36 +97,31 @@ mount with such options, since it allows any user with write access to use up all the memory on the machine; but enhances the scalability of that instance in a system with many CPUs making intensive use of it. +tmpfs blocks may be swapped out, when there is a shortage of memory. +tmpfs has a mount option to disable its use of swap: + +====== =========================================================== +noswap Disables swap. Remounts must respect the original settings. + By default swap is enabled. +====== =========================================================== + tmpfs also supports Transparent Huge Pages which requires a kernel configured with CONFIG_TRANSPARENT_HUGEPAGE and with huge supported for your system (has_transparent_hugepage(), which is architecture specific). The mount options for this are: -====== ============================================================ -huge=0 never: disables huge pages for the mount -huge=1 always: enables huge pages for the mount -huge=2 within_size: only allocate huge pages if the page will be - fully within i_size, also respect fadvise()/madvise() hints. -huge=3 advise: only allocate huge pages if requested with - fadvise()/madvise() -====== ============================================================ - -There is a sysfs file which you can also use to control system wide THP -configuration for all tmpfs mounts, the file is: - -/sys/kernel/mm/transparent_hugepage/shmem_enabled - -This sysfs file is placed on top of THP sysfs directory and so is registered -by THP code. It is however only used to control all tmpfs mounts with one -single knob. Since it controls all tmpfs mounts it should only be used either -for emergency or testing purposes. The values you can set for shmem_enabled are: - -== ============================================================ --1 deny: disables huge on shm_mnt and all mounts, for - emergency use --2 force: enables huge on shm_mnt and all mounts, w/o needing - option, for testing -== ============================================================ +================ ============================================================== +huge=never Do not allocate huge pages. This is the default. +huge=always Attempt to allocate huge page every time a new page is needed. +huge=within_size Only allocate huge page if it will be fully within i_size. + Also respect madvise(2) hints. +huge=advise Only allocate huge page if requested with madvise(2). +================ ============================================================== + +See also Documentation/admin-guide/mm/transhuge.rst, which describes the +sysfs file /sys/kernel/mm/transparent_hugepage/shmem_enabled: which can +be used to deny huge pages on all tmpfs mounts in an emergency, or to +force huge pages on all tmpfs mounts for testing. 
tmpfs has a mount option to set the NUMA memory allocation policy for all files in that instance (if CONFIG_NUMA is enabled) - which can be -- cgit v1.2.3 From fa598952fac059054316dccb2213478ccb81a0d1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 23 Jul 2023 14:05:54 -0700 Subject: shmem: minor fixes to splice-read implementation HWPoison: my reading of folio_test_hwpoison() is that it only tests the head page of a large folio, whereas splice_folio_into_pipe() will splice as much of the folio as it can: so for safety we should also check the has_hwpoisoned flag, set if any of the folio's pages are hwpoisoned. (Perhaps that ugliness can be improved at the mm end later.) The call to splice_zeropage_into_pipe() risked overrunning past EOF: ask it for "part" not "len". Link: https://lkml.kernel.org/r/32c72c9c-72a8-115f-407d-f0148f368@google.com Fixes: bd194b187115 ("shmem: Implement splice-read") Signed-off-by: Hugh Dickins Reviewed-by: David Howells Cc: David Hildenbrand Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2f2e0e618072..f5af4b943e42 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2796,7 +2796,8 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (*ppos >= i_size_read(inode)) break; - error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ); + error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, + SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -2805,7 +2806,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (folio) { folio_unlock(folio); - if (folio_test_hwpoison(folio)) { + if (folio_test_hwpoison(folio) || + (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio))) { error = -EIO; break; } @@ -2841,7 +2844,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, folio_put(folio); folio = NULL; } else { - n = splice_zeropage_into_pipe(pipe, *ppos, len); + n = splice_zeropage_into_pipe(pipe, *ppos, part); } if (!n) -- cgit v1.2.3 From 8b1cb4a2e819a2e4851ceeba53563f4edf03a8cf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 23 Jul 2023 14:17:55 -0700 Subject: mm/pagewalk: fix EFI_PGT_DUMP of espfix area Booting x86_64 with CONFIG_EFI_PGT_DUMP=y shows messages of the form "mm/pgtable-generic.c:53: bad pmd (____ptrval____)(8000000100077061)". EFI_PGT_DUMP dumps all of efi_mm, including the espfix area, which is set up with pmd entries which fit the pmd_bad() check: so 0d940a9b270b warns and clears those entries, which would ruin running Win16 binaries. The failing pte_offset_map() stopped such a kernel from even booting, until a few commits later be872f83bf57 changed the pagewalk to tolerate that: but it needs to be even more careful, to not spoil those entries. I might have preferred to change init_espfix_ap() not to use "bad" pmd entries; or to leave them out of the efi_mm dump. But there is great value in staying away from there, and a pagewalk check of address against TASK_SIZE may protect from other such aberrations too. 
Link: https://lkml.kernel.org/r/22bca736-4cab-9ee5-6a52-73a3b2bbe865@google.com Closes: https://lore.kernel.org/linux-mm/CABXGCsN3JqXckWO=V7p=FhPU1tK03RE1w9UE6xL5Y86SMk209w@mail.gmail.com/ Fixes: 0d940a9b270b ("mm/pgtable: allow pte_offset_map[_lock]() to fail") Fixes: be872f83bf57 ("mm/pagewalk: walk_pte_range() allow for pte_offset_map()") Signed-off-by: Hugh Dickins Reported-by: Mikhail Gavrilov Tested-by: Mikhail Gavrilov Cc: Bagas Sanjaya Cc: Laura Abbott Signed-off-by: Andrew Morton --- mm/pagewalk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 64437105fe0d..2022333805d3 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -48,8 +48,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (walk->no_vma) { /* * pte_offset_map() might apply user-specific validation. + * Indeed, on x86_64 the pmd entries set up by init_espfix_ap() + * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear), + * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them. */ - if (walk->mm == &init_mm) + if (walk->mm == &init_mm || addr >= TASK_SIZE) pte = pte_offset_kernel(pmd, addr); else pte = pte_offset_map(pmd, addr); -- cgit v1.2.3 From 15571273db93ac2e4415e74280e04566c31d5eb0 Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Sat, 22 Jul 2023 18:03:30 -0700 Subject: scripts/spelling.txt: remove 'thead' as a typo T-Head is a vendor of processor core IP, and they have recently introduced the RISC-V TH1520 SoC. Remove 'thead' as a typo of 'thread' to avoid checkpatch incorrectly warning that 'thead' is typo in patches that add support for T-Head designs in the kernel. Link: https://lkml.kernel.org/r/20230723010329.674186-1-dfustini@baylibre.com Link: https://www.t-head.cn/ Signed-off-by: Drew Fustini Acked-by: Guo Ren Cc: Conor Dooley Cc: Jisheng Zhang Cc: Colin Ian King Cc: Diederik de Haas Cc: Ian Rogers Cc: Luca Ceresoli # versaclock5 Cc: Randy Dunlap Cc: SeongJae Park Signed-off-by: Andrew Morton --- scripts/spelling.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index fc7ba95e86a0..855c4863124b 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1541,7 +1541,6 @@ temeprature||temperature temorary||temporary temproarily||temporarily temperture||temperature -thead||thread theads||threads therfore||therefore thier||their -- cgit v1.2.3 From b1f02b95758d05b799731d939e76a0bd6da312db Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 22 Jul 2023 00:51:07 +0200 Subject: mm: fix memory ordering for mm_lock_seq and vm_lock_seq mm->mm_lock_seq effectively functions as a read/write lock; therefore it must be used with acquire/release semantics. A specific example is the interaction between userfaultfd_register() and lock_vma_under_rcu(). 
userfaultfd_register() does the following from the point where it changes a VMA's flags to the point where concurrent readers are permitted again (in a simple scenario where only a single private VMA is accessed and no merging/splitting is involved): userfaultfd_register userfaultfd_set_vm_flags vm_flags_reset vma_start_write down_write(&vma->vm_lock->lock) vma->vm_lock_seq = mm_lock_seq [marks VMA as busy] up_write(&vma->vm_lock->lock) vm_flags_init [sets VM_UFFD_* in __vm_flags] vma->vm_userfaultfd_ctx.ctx = ctx mmap_write_unlock vma_end_write_all WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1) [unlocks VMA] There are no memory barriers in between the __vm_flags update and the mm->mm_lock_seq update that unlocks the VMA, so the unlock can be reordered to above the `vm_flags_init()` call, which means from the perspective of a concurrent reader, a VMA can be marked as a userfaultfd VMA while it is not VMA-locked. That's bad, we definitely need a store-release for the unlock operation. The non-atomic write to vma->vm_lock_seq in vma_start_write() is mostly fine because all accesses to vma->vm_lock_seq that matter are always protected by the VMA lock. There is a racy read in vma_start_read() though that can tolerate false-positives, so we should be using WRITE_ONCE() to keep things tidy and data-race-free (including for KCSAN). On the other side, lock_vma_under_rcu() works as follows in the relevant region for locking and userfaultfd check: lock_vma_under_rcu vma_start_read vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [early bailout] down_read_trylock(&vma->vm_lock->lock) vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [main check] userfaultfd_armed checks vma->vm_flags & __VM_UFFD_FLAGS Here, the interesting aspect is how far down the mm->mm_lock_seq read can be reordered - if this read is reordered down below the vma->vm_flags access, this could cause lock_vma_under_rcu() to partly operate on information that was read while the VMA was supposed to be locked. To prevent this kind of downwards bleeding of the mm->mm_lock_seq read, we need to read it with a load-acquire. Some of the comment wording is based on suggestions by Suren. BACKPORT WARNING: One of the functions changed by this patch (which I've written against Linus' tree) is vma_try_start_write(), but this function no longer exists in mm/mm-everything. I don't know whether the merged version of this patch will be ordered before or after the patch that removes vma_try_start_write(). If you're backporting this patch to a tree with vma_try_start_write(), make sure this patch changes that function. Link: https://lkml.kernel.org/r/20230721225107.942336-1-jannh@google.com Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it") Signed-off-by: Jann Horn Reviewed-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 29 +++++++++++++++++++++++------ include/linux/mm_types.h | 28 ++++++++++++++++++++++++++++ include/linux/mmap_lock.h | 10 ++++++++-- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dd73e4f3d8e..406ab9ea818f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -641,8 +641,14 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} */ static inline bool vma_start_read(struct vm_area_struct *vma) { - /* Check before locking. A race might cause false locked result. */ - if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) + /* + * Check before locking. 
A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -653,8 +659,13 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -676,7 +687,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); + *mm_lock_seq = vma->vm_mm->mm_lock_seq; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -688,7 +699,13 @@ static inline void vma_start_write(struct vm_area_struct *vma) return; down_write(&vma->vm_lock->lock); - vma->vm_lock_seq = mm_lock_seq; + /* + * We should use WRITE_ONCE() here because we can have concurrent reads + * from the early lockless pessimistic check in vma_start_read(). + * We don't really care about the correctness of that early check, but + * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. + */ + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); up_write(&vma->vm_lock->lock); } @@ -702,7 +719,7 @@ static inline bool vma_try_start_write(struct vm_area_struct *vma) if (!down_write_trylock(&vma->vm_lock->lock)) return false; - vma->vm_lock_seq = mm_lock_seq; + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); up_write(&vma->vm_lock->lock); return true; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index de10fc797c8e..5e74ce4a28cd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -514,6 +514,20 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK + /* + * Can only be written (using WRITE_ONCE()) while holding both: + * - mmap_lock (in write mode) + * - vm_lock->lock (in write mode) + * Can be read reliably while holding one of: + * - mmap_lock (in read or write mode) + * - vm_lock->lock (in read or write mode) + * Can be read unreliably (using READ_ONCE()) for pessimistic bailout + * while holding nothing (except RCU to keep the VMA struct allocated). + * + * This sequence counter is explicitly allowed to overflow; sequence + * counter reuse can only lead to occasional unnecessary use of the + * slowpath. + */ int vm_lock_seq; struct vma_lock *vm_lock; @@ -679,6 +693,20 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + /* + * This field has lock-like semantics, meaning it is sometimes + * accessed with ACQUIRE/RELEASE semantics. + * Roughly speaking, incrementing the sequence number is + * equivalent to releasing locks on VMAs; reading the sequence + * number can be part of taking a read lock on a VMA. 
+ * + * Can be modified under write mmap_lock using RELEASE + * semantics. + * Can be read with no other protection when holding write + * mmap_lock. + * Can be read with ACQUIRE semantics if not holding write + * mmap_lock. + */ int mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index aab8f1b28d26..e05e167dbd16 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -76,8 +76,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); - /* No races during update due to exclusive mmap_lock being held */ - WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); + /* + * Nobody can concurrently modify mm->mm_lock_seq due to exclusive + * mmap_lock being held. + * We need RELEASE semantics here to ensure that preceding stores into + * the VMA take effect before we unlock it with this store. + * Pairs with ACQUIRE semantics in vma_start_read(). + */ + smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); } #else static inline void vma_end_write_all(struct mm_struct *mm) {} -- cgit v1.2.3 From d8ab9f7b644a2c9b64de405c1953c905ff219dc9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 21 Jul 2023 05:46:43 +0200 Subject: mm: lock VMA in dup_anon_vma() before setting ->anon_vma When VMAs are merged, dup_anon_vma() is called with `dst` pointing to the VMA that is being expanded to cover the area previously occupied by another VMA. This currently happens while `dst` is not write-locked. This means that, in the `src->anon_vma && !dst->anon_vma` case, as soon as the assignment `dst->anon_vma = src->anon_vma` has happened, concurrent page faults can happen on `dst` under the per-VMA lock. This is already icky in itself, since such page faults can now install pages into `dst` that are attached to an `anon_vma` that is not yet tied back to the `anon_vma` with an `anon_vma_chain`. But if `anon_vma_clone()` fails due to an out-of-memory error, things get much worse: `anon_vma_clone()` then reverts `dst->anon_vma` back to NULL, and `dst` remains completely unconnected to the `anon_vma`, even though we can have pages in the area covered by `dst` that point to the `anon_vma`. This means the `anon_vma` of such pages can be freed while the pages are still mapped into userspace, which leads to UAF when a helper like folio_lock_anon_vma_read() tries to look up the anon_vma of such a page. This theoretically is a security bug, but I believe it is really hard to actually trigger as an unprivileged user because it requires that you can make an order-0 GFP_KERNEL allocation fail, and the page allocator tries pretty hard to prevent that. I think doing the vma_start_write() call inside dup_anon_vma() is the most straightforward fix for now. For a kernel-assisted reproducer, see the notes section of the patch mail. Link: https://lkml.kernel.org/r/20230721034643.616851-1-jannh@google.com Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it") Signed-off-by: Jann Horn Reviewed-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 3eda23c9ebe7..3937479d0e07 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -615,6 +615,7 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. 
*/ if (src->anon_vma && !dst->anon_vma) { + vma_start_write(dst); dst->anon_vma = src->anon_vma; return anon_vma_clone(dst, src); } -- cgit v1.2.3 From 286812b041ccc74b114cada5c10ada70a6277a20 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 20 Jul 2023 14:02:56 -0700 Subject: mailmap: update remaining active codeaurora.org email addresses The lack of mailmap updates for @codeaurora.org addresses reduces the usefulness of tools such as get_maintainer.pl. Some recent (and welcome!) additions has been made to improve the situation, this concludes the effort. Link: https://lkml.kernel.org/r/20230720210256.1296567-1-quic_bjorande@quicinc.com Signed-off-by: Bjorn Andersson Cc: Arnd Bergmann Cc: Konrad Dybcio Signed-off-by: Andrew Morton --- .mailmap | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index a33b9f56357c..5dd318121982 100644 --- a/.mailmap +++ b/.mailmap @@ -13,7 +13,9 @@ Aaron Durbin Abel Vesa Abel Vesa +Abhijeet Dharmapurikar Abhinav Kumar +Ahmad Masri Adam Oldham Adam Radford Adriana Reus @@ -30,6 +32,7 @@ Alexander Mikhalitsyn Alexandre Belloni Alexandre Ghiti +Alexei Avshalom Lazar Alexei Starovoitov Alexei Starovoitov Alexei Starovoitov @@ -37,8 +40,11 @@ Alex Hung Alex Shi Alex Shi Alex Shi +Aloka Dixit Al Viro Al Viro +Amit Blay +Amit Nischal Andi Kleen Andi Shyti Andreas Herrmann @@ -54,6 +60,8 @@ Andrey Ryabinin Andrzej Hajda André Almeida Andy Adamson +Anilkumar Kolli +Anirudh Ghayal Antoine Tenart Antoine Tenart Antonio Ospite @@ -62,9 +70,17 @@ Archit Taneja Ard Biesheuvel Arnaud Patard Arnd Bergmann +Arun Kumar Neelakantam +Ashok Raj Nagarajan +Ashwin Chaugule +Asutosh Das Atish Patra +Avaneesh Kumar Dwivedi Axel Dyks Axel Lin +Balakrishna Godavarthi +Banajit Goswami +Baochen Qiang Baolin Wang Baolin Wang Baolin Wang @@ -93,12 +109,15 @@ Brian Avery Brian King Brian Silverman Cai Huoqing +Can Guo +Carl Huang Changbin Du Changbin Du Chao Yu Chao Yu Chris Chiu Chris Chiu +Chris Lew Christian Borntraeger Christian Borntraeger Christian Borntraeger @@ -119,7 +138,10 @@ Daniel Borkmann Daniel Borkmann Daniel Borkmann David Brownell +David Collins David Woodhouse +Dedy Lansky +Deepak Kumar Singh Dengcheng Zhu Dengcheng Zhu Dengcheng Zhu @@ -136,6 +158,7 @@ Dmitry Safonov <0x7f454c46@gmail.com> Domen Puncer Douglas Gilbert Ed L. Cashin +Elliot Berman Enric Balletbo i Serra Enric Balletbo i Serra Erik Kaneda @@ -148,6 +171,7 @@ Faith Ekstrand Felipe W Damasio Felix Kuhling Felix Moeller +Fenglin Wu Filipe Lautert Finn Thain Franck Bui-Huu @@ -171,8 +195,11 @@ Greg Kurz Gregory CLEMENT Guilherme G. Piccoli Guilherme G. Piccoli +Gokul Sriram Palanisamy +Govindaraj Saminathan Guo Ren Guo Ren +Guru Das Srinagesh Gustavo Padovan Gustavo Padovan Hanjun Guo @@ -190,6 +217,7 @@ Huacai Chen J. Bruce Fields J. 
Bruce Fields Jacob Shin +Jack Pham Jaegeuk Kim Jaegeuk Kim Jaegeuk Kim @@ -217,10 +245,12 @@ Jayachandran C Jayachandran C Jean Tourrilhes +Jeevan Shriram Jeff Garzik Jeff Layton Jeff Layton Jeff Layton +Jeffrey Hugo Jens Axboe Jens Axboe Jens Axboe @@ -228,6 +258,7 @@ Jens Axboe Jens Osterkamp Jernej Skrabec Jessica Zhang +Jilai Wang Jiri Pirko Jiri Pirko Jiri Pirko @@ -238,6 +269,7 @@ Jiri Slaby Jiri Slaby Jisheng Zhang Jisheng Zhang +Jishnu Prakash Johan Hovold Johan Hovold John Crispin @@ -256,6 +288,7 @@ Jordan Crouse Josh Poimboeuf Josh Poimboeuf +Jouni Malinen Juha Yrjola Juha Yrjola Juha Yrjola @@ -263,6 +296,8 @@ Julien Thierry Iskren Chernev Kalle Valo Kalyan Thota +Karthikeyan Periyasamy +Kathiravan T Kay Sievers Kees Cook Kees Cook @@ -271,6 +306,8 @@ Kees Cook Keith Busch Keith Busch Kenneth W Chen +Kenneth Westfield +Kiran Gunda Kirill Tkhai Konstantin Khlebnikov Konstantin Khlebnikov @@ -279,6 +316,7 @@ Krishna Manikandan Krzysztof Kozlowski Krzysztof Kozlowski Krzysztof Kozlowski +Kshitiz Godara Kuninori Morimoto Kuogee Hsieh Lee Jones @@ -292,19 +330,27 @@ Leonid I Ananiev Leon Romanovsky Leon Romanovsky Leon Romanovsky +Liam Mark Linas Vepstas Linus Lüssing Linus Lüssing Li Yang Li Yang +Lior David Lorenzo Pieralisi Luca Ceresoli Lukasz Luba +Luo Jie Maciej W. Rozycki Maciej W. Rozycki +Maharaja Kennadyrajan +Maheshwar Ajja +Malathi Gottam +Manikanta Pubbisetty Manivannan Sadhasivam Manivannan Sadhasivam +Manoj Basapathi Marcin Nowakowski Marc Zyngier Marek Behún @@ -334,6 +380,7 @@ Matt Ranostay Matt Ranostay Matthew Ranostay Matt Ranostay Matt Redfearn +Maulik Shah Mauro Carvalho Chehab Mauro Carvalho Chehab Mauro Carvalho Chehab @@ -346,7 +393,10 @@ Maxim Mikityanskiy Maxime Ripard Maxime Ripard Maxime Ripard +Maya Erez Mayuresh Janorkar +Md Sadre Alam +Miaoqing Pan Michael Buesch Michal Simek Michel Dänzer @@ -357,6 +407,7 @@ Miguel Ojeda Mike Rapoport Mike Rapoport Mike Rapoport +Mike Tipton Miodrag Dinic Miquel Raynal Mitesh shah @@ -365,9 +416,13 @@ Morten Welinder Morten Welinder Morten Welinder Morten Welinder +Mukesh Ojha +Muna Sinada +Murali Nalajala Mythri P K Nadia Yvette Chambers William Lee Irwin III Nathan Chancellor +Neeraj Upadhyay Neil Armstrong Nguyen Anh Quynh Nicholas Piggin @@ -386,6 +441,7 @@ Nikolay Aleksandrov Nikolay Aleksandrov Nikolay Aleksandrov Nikolay Aleksandrov +Odelu Kukatla Oleksandr Natalenko Oleksij Rempel Oleksij Rempel @@ -393,6 +449,7 @@ Oleksij Rempel Oleksij Rempel Oleksij Rempel Oliver Upton +Oza Pawandeep Pali Rohár Paolo 'Blaisorblade' Giarrusso Patrick Mochel @@ -404,11 +461,14 @@ Paul E. McKenney Paul E. McKenney Paul Mackerras Paul Mackerras +Pavankumar Kondeti Peter A Jonsson Peter Oruba Peter Oruba Pratyush Anand Praveen BP +Pradeep Kumar Chitrapu +Prasad Sodagudi Punit Agrawal Qais Yousef Qais Yousef @@ -417,10 +477,16 @@ Quentin Perret Rafael J. 
Wysocki Rajeev Nandan Rajendra Nayak +Rajeshwari Ravindra Kamble +Raju P.L.S.S.S.N Rajesh Shah +Rakesh Pillai Ralf Baechle Ralf Wildenhues +Ram Chandra Jangir Randy Dunlap +Ravi Kumar Bokka +Ravi Kumar Siddojigari Rémi Denis-Courmont Ricardo Ribalda Ricardo Ribalda Ricardo Ribalda Delgado @@ -429,6 +495,7 @@ Richard Leitner Richard Leitner Richard Leitner Robert Foss +Rocky Liao Roman Gushchin Roman Gushchin Roman Gushchin @@ -446,24 +513,35 @@ Santosh Shilimkar Santosh Shilimkar Sarangdhar Joshi Sascha Hauer +Sahitya Tummala +Sathishkumar Muruganandam Satya Priya S.Çağlar Onur +Sayali Lokhande Sean Christopherson Sean Nyekjaer +Sean Tranchetti Sebastian Reichel Sebastian Reichel Sedat Dilek +Senthilkumar N L Seth Forshee Shannon Nelson Shannon Nelson Shannon Nelson +Sharath Chandra Vurukala Shiraz Hashim Shuah Khan Shuah Khan Shuah Khan Shuah Khan +Sibi Sankar +Sid Manning Simon Arlott Simon Kelley +Sricharan Ramabadhran +Srinivas Ramana +Sriram R Stéphane Witzmann Stephen Hemminger Stephen Hemminger @@ -471,22 +549,30 @@ Stephen Hemminger Stephen Hemminger Steve Wise Steve Wise -Subash Abhinov Kasiviswanathan +Subash Abhinov Kasiviswanathan +Subbaraman Narayanamurthy Subhash Jadavani +Sudarshan Rajagopalan Sudeep Holla Sudeep KarkadaNagesha Sumit Semwal +Surabhi Vishnoi Takashi YOSHII +Tamizh Chelvam Raja +Taniya Das Tejun Heo Thomas Graf Thomas Körper Thomas Pedersen Tiezhu Yang +Tingwei Zhang +Tirupathi Reddy Tobias Klauser Tobias Klauser Tobias Klauser Tobias Klauser Todor Tomov Tony Luck +Trilok Soni TripleX Chung TripleX Chung Tsuneo Yoshioka @@ -499,11 +585,17 @@ Uwe Kleine-König Uwe Kleine-König Uwe Kleine-König Valdis Kletnieks +Vara Reddy +Varadarajan Narayanan +Vasanthakumar Thiagarajan Vasily Averin Vasily Averin Vasily Averin Vasily Averin Valentin Schneider +Veera Sundaram Sankaran +Veerabhadrarao Badiganti +Venkateswara Naralasetty Vikash Garodia Vinod Koul Vinod Koul @@ -513,11 +605,14 @@ Viresh Kumar Viresh Kumar Viresh Kumar Viresh Kumar +Vivek Aknurwar Vivien Didelot Vlad Dogaru Vladimir Davydov Vladimir Davydov WeiXiong Liao +Wen Gong +Wesley Cheng Will Deacon Wolfram Sang Wolfram Sang -- cgit v1.2.3 From 641db40f3afe7998011bfabc726dba3e698f8196 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 25 Jul 2023 20:03:16 +0300 Subject: proc/vmcore: fix signedness bug in read_from_oldmem() The bug is the error handling: if (tmp < nr_bytes) { "tmp" can hold negative error codes but because "nr_bytes" is type size_t the negative error codes are treated as very high positive values (success). Fix this by changing "nr_bytes" to type ssize_t. The "nr_bytes" variable is used to store values between 1 and PAGE_SIZE and they can fit in ssize_t without any issue. 
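As an illustration of the pitfall (a hypothetical stand-alone user-space sketch, not part of the kernel change; the variable names merely mirror the kernel code):

    #include <stdio.h>
    #include <sys/types.h>

    int main(void)
    {
            size_t  nr_bytes_unsigned = 4096;    /* old type of nr_bytes */
            ssize_t nr_bytes_signed   = 4096;    /* type after this fix */
            ssize_t tmp = -22;                   /* e.g. -EINVAL returned by a callee */

            /* tmp is converted to size_t, so -22 becomes a huge positive
             * value and the error branch is never taken: */
            if (tmp < nr_bytes_unsigned)
                    printf("error handled\n");
            else
                    printf("error silently treated as success\n");   /* this runs */

            /* with both sides signed, the check behaves as intended: */
            if (tmp < nr_bytes_signed)
                    printf("error handled\n");                       /* this runs */

            return 0;
    }

Neither comparison is a compile error, and the signed/unsigned mismatch typically only surfaces with extra warnings such as -Wsign-compare, which is why this class of bug is easy to miss.
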
Link: https://lkml.kernel.org/r/b55f7eed-1c65-4adc-95d1-6c7c65a54a6e@moroto.mountain Fixes: 5d8de293c224 ("vmcore: convert copy_oldmem_page() to take an iov_iter") Signed-off-by: Dan Carpenter Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cb80a7703d58..1fb213f379a5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -132,7 +132,7 @@ ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, u64 *ppos, bool encrypted) { unsigned long pfn, offset; - size_t nr_bytes; + ssize_t nr_bytes; ssize_t read = 0, tmp; int idx; -- cgit v1.2.3 From 6c54312f9689fbe27c70db5d42eebd29d04b672e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 17 Jul 2023 11:18:12 -0700 Subject: mm/memory-failure: fix hardware poison check in unpoison_memory() It was pointed out[1] that using folio_test_hwpoison() is wrong as we need to check the indiviual page that has poison. folio_test_hwpoison() only checks the head page so go back to using PageHWPoison(). User-visible effects include existing hwpoison-inject tests possibly failing as unpoisoning a single subpage could lead to unpoisoning an entire folio. Memory unpoisoning could also not work as expected as the function will break early due to only checking the head page and not the actually poisoned subpage. [1]: https://lore.kernel.org/lkml/ZLIbZygG7LqSI9xe@casper.infradead.org/ Link: https://lkml.kernel.org/r/20230717181812.167757-1-sidhartha.kumar@oracle.com Fixes: a6fddef49eef ("mm/memory-failure: convert unpoison_memory() to folios") Signed-off-by: Sidhartha Kumar Reported-by: Matthew Wilcox (Oracle) Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e245191e6b04..ece5d481b5ff 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2487,7 +2487,7 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (!folio_test_hwpoison(folio)) { + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; -- cgit v1.2.3
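
As a closing aside on the patch above: the head-page-only behaviour of folio_test_hwpoison() versus the per-page PageHWPoison() check can be sketched as follows (illustrative fragment only, not code from the patch; it assumes the target pfn, the struct page p for it, and its folio, looked up in the usual way):

    struct page  *p     = pfn_to_page(pfn);   /* the page being unpoisoned */
    struct folio *folio = page_folio(p);      /* its (possibly large) folio */

    folio_test_hwpoison(folio);   /* tests the flag on the folio's head page only,
                                   * so poison on a tail page can be missed */
    PageHWPoison(p);              /* tests the flag on the specific subpage,
                                   * which is what unpoisoning a single pfn needs */

The same head-versus-subpage distinction is why the earlier shmem splice-read patch in this series also checks folio_test_has_hwpoisoned() on large folios.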