summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/damon/dbgfs.c7
-rw-r--r--mm/huge_memory.c20
-rw-r--r--mm/hugetlb.c41
-rw-r--r--mm/hugetlb_vmemmap.c1
-rw-r--r--mm/kmemleak.c61
-rw-r--r--mm/kmsan/instrumentation.c1
-rw-r--r--mm/kmsan/kmsan.h2
-rw-r--r--mm/kmsan/shadow.c1
-rw-r--r--mm/maccess.c2
-rw-r--r--mm/madvise.c12
-rw-r--r--mm/memory-failure.c5
-rw-r--r--mm/memory-tiers.c8
-rw-r--r--mm/mempolicy.c17
-rw-r--r--mm/memremap.c1
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/mmap.c27
-rw-r--r--mm/page_alloc.c21
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/shmem.c17
-rw-r--r--mm/slab_common.c24
-rw-r--r--mm/userfaultfd.c27
-rw-r--r--mm/zsmalloc.c3
22 files changed, 218 insertions, 89 deletions
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 6f0ae7d3ae39..b3f454a5c682 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -890,6 +890,7 @@ out:
static int dbgfs_rm_context(char *name)
{
struct dentry *root, *dir, **new_dirs;
+ struct inode *inode;
struct damon_ctx **new_ctxs;
int i, j;
int ret = 0;
@@ -905,6 +906,12 @@ static int dbgfs_rm_context(char *name)
if (!dir)
return -ENOENT;
+ inode = d_inode(dir);
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = -EINVAL;
+ goto out_dput;
+ }
+
new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
GFP_KERNEL);
if (!new_dirs) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1cc4a5f4791e..811d19b5c4f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2206,9 +2206,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
- /* NOTE: this may set soft-dirty too on some archs */
- if (dirty)
- entry = pte_mkdirty(entry);
+ /*
+ * NOTE: we don't do pte_mkdirty when dirty==true
+ * because it breaks sparc64 which can sigsegv
+ * random process. Need to revisit when we figure
+ * out what is special with sparc64.
+ */
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
@@ -2455,7 +2458,16 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail);
page_tail->mapping = head->mapping;
page_tail->index = head->index + tail;
- page_tail->private = 0;
+
+ /*
+ * page->private should not be set in tail pages with the exception
+ * of swap cache pages that store the swp_entry_t in tail pages.
+ * Fix up and warn once if private is unexpectedly set.
+ */
+ if (!folio_test_swapcache(page_folio(head))) {
+ VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
+ page_tail->private = 0;
+ }
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b586cdd75930..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1014,15 +1014,23 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
/*
* Clear vm_private_data
+ * - For shared mappings this is a per-vma semaphore that may be
+ * allocated in a subsequent call to hugetlb_vm_op_open.
+ * Before clearing, make sure pointer is not associated with vma
+ * as this will leak the structure. This is the case when called
+ * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
+ * been called to allocate a new structure.
* - For MAP_PRIVATE mappings, this is the reserve map which does
* not apply to children. Faults generated by the children are
* not guaranteed to succeed, even if read-only.
- * - For shared mappings this is a per-vma semaphore that may be
- * allocated in a subsequent call to hugetlb_vm_op_open.
*/
- vma->vm_private_data = (void *)0;
- if (!(vma->vm_flags & VM_MAYSHARE))
- return;
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock && vma_lock->vma != vma)
+ vma->vm_private_data = NULL;
+ } else
+ vma->vm_private_data = NULL;
}
/*
@@ -2924,11 +2932,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
+ spin_lock_irq(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
- spin_lock_irq(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
/* Fall through */
@@ -4601,6 +4609,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
/*
+ * HPAGE_RESV_OWNER indicates a private mapping.
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
@@ -4615,11 +4624,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
/*
* vma_lock structure for sharable mappings is vma specific.
- * Clear old pointer (if copied via vm_area_dup) and create new.
+ * Clear old pointer (if copied via vm_area_dup) and allocate
+ * new structure. Before clearing, make sure vma_lock is not
+ * for this vma.
*/
if (vma->vm_flags & VM_MAYSHARE) {
- vma->vm_private_data = NULL;
- hugetlb_vma_lock_alloc(vma);
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock) {
+ if (vma_lock->vma != vma) {
+ vma->vm_private_data = NULL;
+ hugetlb_vma_lock_alloc(vma);
+ } else
+ pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
+ } else
+ hugetlb_vma_lock_alloc(vma);
}
}
@@ -6092,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index ba2a2596fb4e..4962dd1ba4a6 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "HugeTLB: " fmt
#include <linux/pgtable.h>
+#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 37af2dc8dac9..646e2979641f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1461,6 +1461,27 @@ static void scan_gray_list(void)
}
/*
+ * Conditionally call resched() in a object iteration loop while making sure
+ * that the given object won't go away without RCU read lock by performing a
+ * get_object() if !pinned.
+ *
+ * Return: false if can't do a cond_resched() due to get_object() failure
+ * true otherwise
+ */
+static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned)
+{
+ if (!pinned && !get_object(object))
+ return false;
+
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ if (!pinned)
+ put_object(object);
+ return true;
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
@@ -1471,7 +1492,7 @@ static void kmemleak_scan(void)
struct zone *zone;
int __maybe_unused i;
int new_leaks = 0;
- int loop1_cnt = 0;
+ int loop_cnt = 0;
jiffies_last_scan = jiffies;
@@ -1480,7 +1501,6 @@ static void kmemleak_scan(void)
list_for_each_entry_rcu(object, &object_list, object_list) {
bool obj_pinned = false;
- loop1_cnt++;
raw_spin_lock_irq(&object->lock);
#ifdef DEBUG
/*
@@ -1514,24 +1534,11 @@ static void kmemleak_scan(void)
raw_spin_unlock_irq(&object->lock);
/*
- * Do a cond_resched() to avoid soft lockup every 64k objects.
- * Make sure a reference has been taken so that the object
- * won't go away without RCU read lock.
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
*/
- if (!(loop1_cnt & 0xffff)) {
- if (!obj_pinned && !get_object(object)) {
- /* Try the next object instead */
- loop1_cnt--;
- continue;
- }
-
- rcu_read_unlock();
- cond_resched();
- rcu_read_lock();
-
- if (!obj_pinned)
- put_object(object);
- }
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, obj_pinned))
+ loop_cnt--; /* Try again on next object */
}
rcu_read_unlock();
@@ -1598,8 +1605,16 @@ static void kmemleak_scan(void)
* scan and color them gray until the next scan.
*/
rcu_read_lock();
+ loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
/*
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
+ */
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, false))
+ loop_cnt--; /* Try again on next object */
+
+ /*
* This is racy but we can save the overhead of lock/unlock
* calls. The missed objects, if any, should be caught in
* the next scan.
@@ -1632,8 +1647,16 @@ static void kmemleak_scan(void)
* Scanning result reporting.
*/
rcu_read_lock();
+ loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
/*
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
+ */
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, false))
+ loop_cnt--; /* Try again on next object */
+
+ /*
* This is racy but we can save the overhead of lock/unlock
* calls. The missed objects, if any, should be caught in
* the next scan.
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 280d15413268..271f135f97a1 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -14,6 +14,7 @@
#include "kmsan.h"
#include <linux/gfp.h>
+#include <linux/kmsan_string.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
index 7019c46d33a7..a14744205435 100644
--- a/mm/kmsan/kmsan.h
+++ b/mm/kmsan/kmsan.h
@@ -124,6 +124,8 @@ static __always_inline bool kmsan_in_runtime(void)
{
if ((hardirq_count() >> HARDIRQ_SHIFT) > 1)
return true;
+ if (in_nmi())
+ return true;
return kmsan_get_context()->kmsan_in_runtime;
}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 21e3e196ec3c..a787c04e9583 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src)
__memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE);
kmsan_leave_runtime();
}
+EXPORT_SYMBOL(kmsan_copy_page_meta);
void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags)
{
diff --git a/mm/maccess.c b/mm/maccess.c
index 5f4d240f67ec..074f6b086671 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -97,7 +97,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
return src - unsafe_addr;
Efault:
pagefault_enable();
- dst[-1] = '\0';
+ dst[0] = '\0';
return -EFAULT;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 2baa93ca2310..c7105ec6d08c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -813,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
if (start & ~huge_page_mask(hstate_vma(vma)))
return false;
- *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+ /*
+ * Madvise callers expect the length to be rounded up to PAGE_SIZE
+ * boundaries, and may be unaware that this VMA uses huge pages.
+ * Avoid unexpected data loss by rounding down the number of
+ * huge pages freed.
+ */
+ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+
return true;
}
@@ -828,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
return -EINVAL;
+ if (start == end)
+ return 0;
+
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index f116b7b6333e..fa8c9d07f9ce 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev)
kfree(tier);
}
-static ssize_t nodes_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t nodelist_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
int ret;
nodemask_t nmask;
@@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev,
mutex_unlock(&memory_tier_lock);
return ret;
}
-static DEVICE_ATTR_RO(nodes);
+static DEVICE_ATTR_RO(nodelist);
static struct attribute *memtier_dev_attrs[] = {
- &dev_attr_nodes.attr,
+ &dev_attr_nodelist.attr,
NULL
};
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a937eaec5b68..61aa9aedb728 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -787,17 +787,22 @@ static int vma_replace_policy(struct vm_area_struct *vma,
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
- MA_STATE(mas, &mm->mm_mt, start - 1, start - 1);
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
- prev = mas_find_rev(&mas, 0);
- if (prev && (start < prev->vm_end))
- vma = prev;
- else
- vma = mas_next(&mas, end - 1);
+ prev = mas_prev(&mas, 0);
+ if (unlikely(!prev))
+ mas_set(&mas, start);
+
+ vma = mas_find(&mas, end - 1);
+ if (WARN_ON(!vma))
+ return 0;
+
+ if (start > vma->vm_start)
+ prev = vma;
for (; vma; vma = mas_next(&mas, end - 1)) {
unsigned long vmstart = max(start, vma->vm_start);
diff --git a/mm/memremap.c b/mm/memremap.c
index 421bec3a29ee..08cbf54fe037 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -335,6 +335,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "File system DAX not supported\n");
return ERR_PTR(-EINVAL);
}
+ params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
break;
diff --git a/mm/migrate.c b/mm/migrate.c
index 1379e1912772..dff333593a8a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1582,6 +1582,13 @@ out:
*/
list_splice(&ret_pages, from);
+ /*
+ * Return 0 in case all subpages of fail-to-migrate THPs are
+ * migrated successfully.
+ */
+ if (list_empty(from))
+ rc = 0;
+
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
diff --git a/mm/mmap.c b/mm/mmap.c
index bf2122af94e7..c3c5c1d6103d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -618,7 +618,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end);
+ struct vm_area_struct *next_next = NULL; /* uninit var warning */
+ struct vm_area_struct *next = find_vma(mm, vma->vm_end);
struct vm_area_struct *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
@@ -2625,14 +2626,14 @@ cannot_expand:
if (error)
goto unmap_and_free_vma;
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
+ /*
+ * Expansion is handled above, merging is handled below.
+ * Drivers should not alter the address of the VMA.
*/
- WARN_ON_ONCE(addr != vma->vm_start);
-
- addr = vma->vm_start;
+ if (WARN_ON((addr != vma->vm_start))) {
+ error = -EINVAL;
+ goto close_and_free_vma;
+ }
mas_reset(&mas);
/*
@@ -2654,7 +2655,6 @@ cannot_expand:
vm_area_free(vma);
vma = merge;
/* Update vm_flags to pick up the change. */
- addr = vma->vm_start;
vm_flags = vma->vm_flags;
goto unmap_writable;
}
@@ -2674,6 +2674,8 @@ cannot_expand:
error = -EINVAL;
if (file)
goto close_and_free_vma;
+ else if (vma->vm_file)
+ goto unmap_and_free_vma;
else
goto free_vma;
}
@@ -2681,6 +2683,8 @@ cannot_expand:
if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
error = -ENOMEM;
if (file)
+ goto close_and_free_vma;
+ else if (vma->vm_file)
goto unmap_and_free_vma;
else
goto free_vma;
@@ -2751,7 +2755,7 @@ unmap_and_free_vma:
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
- if (vm_flags & VM_SHARED)
+ if (file && (vm_flags & VM_SHARED))
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
@@ -2852,6 +2856,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (next->vm_flags != vma->vm_flags)
goto out;
+ if (start + size <= next->vm_end)
+ break;
+
prev = next;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e20ade858e71..218b28ee49ed 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -807,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx)
p->mapping = TAIL_MAPPING;
set_compound_head(p, head);
+ set_page_private(p, 0);
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -5784,14 +5785,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
if (addr) {
- unsigned long alloc_end = addr + (PAGE_SIZE << order);
- unsigned long used = addr + PAGE_ALIGN(size);
-
- split_page(virt_to_page((void *)addr), order);
- while (used < alloc_end) {
- free_page(used);
- used += PAGE_SIZE;
- }
+ unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
+ struct page *page = virt_to_page((void *)addr);
+ struct page *last = page + nr;
+
+ split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
+ while (page < --last)
+ set_page_refcounted(last);
+
+ last = page + (1UL << order);
+ for (page += nr; page < last; page++)
+ __free_pages_ok(page, 0, FPI_TO_TAIL);
}
return (void *)addr;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 04141a9bea70..47fbc1696466 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
zone->zone_start_pfn);
if (skip_isolation) {
- int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+ int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
VM_BUG_ON(!is_migrate_isolate(mt));
} else {
diff --git a/mm/shmem.c b/mm/shmem.c
index 8280a5cb48df..c1d8b8a1aa3b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2424,9 +2424,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (!zeropage) { /* COPY */
page_kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
PAGE_SIZE);
+ pagefault_enable();
kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 33b1886b06eb..0042fb2730d1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -941,7 +941,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
ret = __kmalloc_large_node(size, flags, node);
- trace_kmalloc(_RET_IP_, ret, size,
+ trace_kmalloc(caller, ret, size,
PAGE_SIZE << get_order(size), flags, node);
return ret;
}
@@ -953,7 +953,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller
ret = __kmem_cache_alloc_node(s, flags, node, size, caller);
ret = kasan_kmalloc(s, ret, size, flags);
- trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node);
+ trace_kmalloc(caller, ret, size, s->size, flags, node);
return ret;
}
@@ -1010,7 +1010,7 @@ EXPORT_SYMBOL(kfree);
/**
* __ksize -- Report full size of underlying allocation
- * @objp: pointer to the object
+ * @object: pointer to the object
*
* This should only be used internally to query the true size of allocations.
* It is not meant to be a way to discover the usable size of an allocation
@@ -1018,7 +1018,7 @@ EXPORT_SYMBOL(kfree);
* the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
* and/or FORTIFY_SOURCE.
*
- * Return: size of the actual memory used by @objp in bytes
+ * Return: size of the actual memory used by @object in bytes
*/
size_t __ksize(const void *object)
{
@@ -1040,7 +1040,6 @@ size_t __ksize(const void *object)
return slab_ksize(folio_slab(folio)->slab_cache);
}
-#ifdef CONFIG_TRACING
void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE,
@@ -1064,7 +1063,6 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
return ret;
}
EXPORT_SYMBOL(kmalloc_node_trace);
-#endif /* !CONFIG_TRACING */
#endif /* !CONFIG_SLOB */
gfp_t kmalloc_fix_flags(gfp_t flags)
@@ -1411,20 +1409,6 @@ void kfree_sensitive(const void *p)
}
EXPORT_SYMBOL(kfree_sensitive);
-/**
- * ksize - get the actual amount of memory allocated for a given object
- * @objp: Pointer to the object
- *
- * kmalloc may internally round up allocations and return more memory
- * than requested. ksize() can be used to determine the actual amount of
- * memory allocated. The caller may use this additional memory, even though
- * a smaller amount of memory was initially specified with the kmalloc call.
- * The caller must guarantee that objp points to a valid object previously
- * allocated with either kmalloc() or kmem_cache_alloc(). The object
- * must not be freed during the duration of the call.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
size_t ksize(const void *objp)
{
size_t size;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e24e8a47ce8a..650ab6cfd5f4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -64,7 +64,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pte_t _dst_pte, *dst_pte;
bool writable = dst_vma->vm_flags & VM_WRITE;
bool vm_shared = dst_vma->vm_flags & VM_SHARED;
- bool page_in_cache = page->mapping;
+ bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
struct inode *inode;
pgoff_t offset, max_off;
@@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!page)
goto out;
- page_kaddr = kmap_atomic(page);
+ page_kaddr = kmap_local_page(page);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
@@ -646,11 +663,11 @@ retry:
mmap_read_unlock(dst_mm);
BUG_ON(!page);
- page_kaddr = kmap(page);
+ page_kaddr = kmap_local_page(page);
err = copy_from_user(page_kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap(page);
+ kunmap_local(page_kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 525758713a55..d03941cace2c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2311,6 +2311,9 @@ void zs_destroy_pool(struct zs_pool *pool)
int fg;
struct size_class *class = pool->size_class[i];
+ if (!class)
+ continue;
+
if (class->index != i)
continue;