Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   6
-rw-r--r--  mm/compaction.c       |   6
-rw-r--r--  mm/damon/dbgfs.c      |  19
-rw-r--r--  mm/damon/sysfs.c      |   2
-rw-r--r--  mm/filemap.c          |   7
-rw-r--r--  mm/frontswap.c        |   3
-rw-r--r--  mm/gup.c              |  34
-rw-r--r--  mm/huge_memory.c      |   6
-rw-r--r--  mm/hugetlb.c          |  14
-rw-r--r--  mm/kfence/core.c      |   2
-rw-r--r--  mm/kfence/report.c    |   1
-rw-r--r--  mm/khugepaged.c       |  14
-rw-r--r--  mm/madvise.c          |   7
-rw-r--r--  mm/memcontrol.c       |  20
-rw-r--r--  mm/memory-failure.c   |  27
-rw-r--r--  mm/memory.c           |  14
-rw-r--r--  mm/migrate_device.c   |  16
-rw-r--r--  mm/page-writeback.c   |   1
-rw-r--r--  mm/page_alloc.c       |  71
-rw-r--r--  mm/page_isolation.c   |  25
-rw-r--r--  mm/readahead.c        |  22
-rw-r--r--  mm/secretmem.c        |   2
-rw-r--r--  mm/slab.c             | 305
-rw-r--r--  mm/slab.h             |  10
-rw-r--r--  mm/slab_common.c      | 242
-rw-r--r--  mm/slob.c             |  45
-rw-r--r--  mm/slub.c             | 881
-rw-r--r--  mm/swap_state.c       |   2
-rw-r--r--  mm/swapfile.c         |   2
-rw-r--r--  mm/util.c             |   4
-rw-r--r--  mm/vmscan.c           |   4
-rw-r--r--  mm/vmstat.c           |  37
32 files changed, 977 insertions(+), 874 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 0331f1461f81..3897e924e40f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -579,6 +579,12 @@ config COMPACTION
it and then we would be really interested to hear about that at
linux-mm@kvack.org.
+config COMPACT_UNEVICTABLE_DEFAULT
+ int
+ depends on COMPACTION
+ default 0 if PREEMPT_RT
+ default 1
+
#
# support for free page reporting
config PAGE_REPORTING
diff --git a/mm/compaction.c b/mm/compaction.c
index 640fa76228dd..10561cb1aaad 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1727,11 +1727,7 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
-#ifdef CONFIG_PREEMPT_RT
-int sysctl_compact_unevictable_allowed __read_mostly = 0;
-#else
-int sysctl_compact_unevictable_allowed __read_mostly = 1;
-#endif
+int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
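
The mm/Kconfig and mm/compaction.c hunks belong together: an int Kconfig symbol is emitted into the generated autoconf header as an ordinary preprocessor constant, which is what lets the PREEMPT_RT #ifdef collapse into a single initializer. A sketch of what the generated header is expected to contain, following the defaults declared above:

    /* include/generated/autoconf.h, CONFIG_PREEMPT_RT=y */
    #define CONFIG_COMPACT_UNEVICTABLE_DEFAULT 0

    /* include/generated/autoconf.h, CONFIG_PREEMPT_RT unset */
    #define CONFIG_COMPACT_UNEVICTABLE_DEFAULT 1

So sysctl_compact_unevictable_allowed keeps the same per-configuration default, just without conditional compilation in the C file.
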
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index cfdf63132d5a..4e51466c4e74 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -884,6 +884,7 @@ static int dbgfs_rm_context(char *name)
struct dentry *root, *dir, **new_dirs;
struct damon_ctx **new_ctxs;
int i, j;
+ int ret = 0;
if (damon_nr_running_ctxs())
return -EBUSY;
@@ -898,14 +899,16 @@ static int dbgfs_rm_context(char *name)
new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
GFP_KERNEL);
- if (!new_dirs)
- return -ENOMEM;
+ if (!new_dirs) {
+ ret = -ENOMEM;
+ goto out_dput;
+ }
new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
GFP_KERNEL);
if (!new_ctxs) {
- kfree(new_dirs);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_new_dirs;
}
for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
@@ -925,7 +928,13 @@ static int dbgfs_rm_context(char *name)
dbgfs_ctxs = new_ctxs;
dbgfs_nr_ctxs--;
- return 0;
+ goto out_dput;
+
+out_new_dirs:
+ kfree(new_dirs);
+out_dput:
+ dput(dir);
+ return ret;
}
static ssize_t dbgfs_rm_context_write(struct file *file,
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 7488e27c87c3..bdef9682d0a0 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -2182,12 +2182,12 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (!t)
return -ENOMEM;
+ damon_add_target(ctx, t);
if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
goto destroy_targets_out;
}
- damon_add_target(ctx, t);
err = damon_sysfs_set_regions(t, sys_target->regions);
if (err)
goto destroy_targets_out;
diff --git a/mm/filemap.c b/mm/filemap.c
index 15800334147b..c943d1b90cc2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2382,6 +2382,8 @@ retry:
static int filemap_read_folio(struct file *file, filler_t filler,
struct folio *folio)
{
+ bool workingset = folio_test_workingset(folio);
+ unsigned long pflags;
int error;
/*
@@ -2390,8 +2392,13 @@ static int filemap_read_folio(struct file *file, filler_t filler,
* fails.
*/
folio_clear_error(folio);
+
/* Start the actual read. The read will unlock the page. */
+ if (unlikely(workingset))
+ psi_memstall_enter(&pflags);
error = filler(file, folio);
+ if (unlikely(workingset))
+ psi_memstall_leave(&pflags);
if (error)
return error;
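
This is the standard PSI memstall bracket: a synchronous read of a page that was previously part of the workingset is accounted as memory stall while the I/O is in flight. Reduced to its shape (do_sync_read() is a stand-in for the filler callback; the rest is as in the hunk above):

    unsigned long pflags;
    bool workingset = folio_test_workingset(folio);
    int error;

    if (unlikely(workingset))
        psi_memstall_enter(&pflags);   /* pflags stores the saved state */
    error = do_sync_read(file, folio); /* stand-in for filler(file, folio) */
    if (unlikely(workingset))
        psi_memstall_leave(&pflags);   /* must pair with the same pflags */

The readahead.c changes further down apply the same bracket around batched submission, carrying a _workingset flag and _pflags in struct readahead_control so that enter and leave stay paired across read_pages().
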
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1a97610308cb..279e55b4ed87 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -125,6 +125,9 @@ void frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
+
+ if (!frontswap_enabled())
+ return;
frontswap_ops->init(type);
}
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..00926abb4426 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2460,9 @@ pte_unmap:
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
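
The ordering rules spelled out in the new comment can be condensed as follows; this is illustrative only, with the fast-gup side mirroring gup_pte_range() and the collapse side reduced to the one call that matters here:

    /* fast-gup side: pin first, then re-check *both* levels */
    pte = ptep_get_lockless(ptep);                   /* (1) read the pte */
    folio = try_grab_folio(page, 1, flags);          /* (2) pin the page */
    if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||  /* THP collapse raced? */
        unlikely(pte_val(pte) != pte_val(*ptep))) {  /* pte-level op raced? */
            gup_put_folio(folio, 1, flags);          /* back off */
            goto pte_unmap;
    }

    /* collapse side: clear the pmd (and flush) before freeing the page
     * table page, so the pmd comparison above is guaranteed to fire */
    _pmd = pmdp_collapse_flush(vma, address, pmd);

Without the extra pmd comparison, fast-gup could keep walking a PTE page whose pmd had already been cleared by khugepaged and which was about to be freed.
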
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..f42bb51e023a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2894,11 +2894,9 @@ static void split_huge_pages_all(void)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
int nr_pages;
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!get_page_unless_zero(page))
+ page = pfn_to_online_page(pfn);
+ if (!page || !get_page_unless_zero(page))
continue;
if (zone != page_zone(page))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e070b8593b37..0bdfc7e1c933 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3420,6 +3420,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
{
int i, nid = page_to_nid(page);
struct hstate *target_hstate;
+ struct page *subpage;
int rc = 0;
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
@@ -3453,15 +3454,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
mutex_lock(&target_hstate->resize_lock);
for (i = 0; i < pages_per_huge_page(h);
i += pages_per_huge_page(target_hstate)) {
+ subpage = nth_page(page, i);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_page_for_demote(page + i,
+ prep_compound_gigantic_page_for_demote(subpage,
target_hstate->order);
else
- prep_compound_page(page + i, target_hstate->order);
- set_page_private(page + i, 0);
- set_page_refcounted(page + i);
- prep_new_huge_page(target_hstate, page + i, nid);
- put_page(page + i);
+ prep_compound_page(subpage, target_hstate->order);
+ set_page_private(subpage, 0);
+ set_page_refcounted(subpage);
+ prep_new_huge_page(target_hstate, subpage, nid);
+ put_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index c252081b11df..239b1b4b094f 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -864,7 +864,7 @@ static void kfence_init_enable(void)
void __init kfence_init(void)
{
- stack_hash_seed = (u32)random_get_entropy();
+ stack_hash_seed = get_random_u32();
/* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
if (!kfence_sample_interval)
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index f5a6d8ba3e21..7e496856c2eb 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -86,6 +86,7 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
/* Also the *_bulk() variants by only checking prefixes. */
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
goto found;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..01dbc6dbd599 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -730,8 +730,8 @@ static void khugepaged_alloc_sleep(void)
DEFINE_WAIT(wait);
add_wait_queue(&khugepaged_wait, &wait);
- freezable_schedule_timeout_interruptible(
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
}
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
diff --git a/mm/madvise.c b/mm/madvise.c
index 5f0f0948a50e..9ff51650f4f0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -451,8 +451,11 @@ regular_page:
continue;
}
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /*
+ * Do not interfere with other mappings of this page and
+ * non-LRU page.
+ */
+ if (!PageLRU(page) || page_mapcount(page) != 1)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e9fc364d5e96..bac2de4b9c42 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -597,25 +597,18 @@ static u64 flush_next_time;
*/
static void memcg_stats_lock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_disable();
-#else
- VM_BUG_ON(!irqs_disabled());
-#endif
+ preempt_disable_nested();
+ VM_WARN_ON_IRQS_ENABLED();
}
static void __memcg_stats_lock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_disable();
-#endif
+ preempt_disable_nested();
}
static void memcg_stats_unlock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_enable();
-#endif
+ preempt_enable_nested();
}
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
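
preempt_disable_nested()/preempt_enable_nested() and VM_WARN_ON_IRQS_ENABLED() are small helpers this change relies on; their expected behaviour, sketched rather than quoted from the headers, is:

    static __always_inline void preempt_disable_nested(void)
    {
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
            preempt_disable();  /* RT: stats nest inside preempt-disabled sections */
        /* !RT: no-op -- callers already run with interrupts disabled */
    }

    /* preempt_enable_nested() mirrors this, and VM_WARN_ON_IRQS_ENABLED()
     * only checks irqs_disabled() on debug (DEBUG_VM, non-RT) builds;
     * otherwise it compiles to nothing. */

That keeps the non-RT fast path free of extra preempt twiddling while still catching callers that reach memcg_stats_lock() with interrupts enabled on debug builds.
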
@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
* interrupt context while other caller need to have disabled interrupt.
*/
__memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
switch (idx) {
case NR_ANON_MAPPED:
case NR_FILE_MAPPED:
@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
WARN_ON_ONCE(!in_task());
break;
default:
- WARN_ON_ONCE(!irqs_disabled());
+ VM_WARN_ON_IRQS_ENABLED();
}
}
@@ -1401,6 +1394,7 @@ static const struct memory_stat memory_stats[] = {
{ "kernel", MEMCG_KMEM },
{ "kernel_stack", NR_KERNEL_STACK_KB },
{ "pagetables", NR_PAGETABLE },
+ { "sec_pagetables", NR_SECONDARY_PAGETABLE },
{ "percpu", MEMCG_PERCPU_B },
{ "sock", MEMCG_SOCK },
{ "vmalloc", MEMCG_VMALLOC },
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 14439806b5ef..e7ac570dda75 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -345,13 +345,17 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* not much we can do. We just print a message and ignore otherwise.
*/
+#define FSDAX_INVALID_PGOFF ULONG_MAX
+
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*
- * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
- * In other cases, such as anonymous and file-backend page, the address to be
- * killed can be caculated by @p itself.
+ * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
+ * filesystem with a memory failure handler has claimed the
+ * memory_failure event. In all other cases, page->index and
+ * page->mapping are sufficient for mapping the page back to its
+ * corresponding user virtual address.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
@@ -367,11 +371,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
tk->addr = page_address_in_vma(p, vma);
if (is_zone_device_page(p)) {
- /*
- * Since page->mapping is not used for fsdax, we need
- * calculate the address based on the vma.
- */
- if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+ if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
} else
@@ -523,7 +523,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, 0, vma, to_kill);
+ add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+ to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -559,7 +560,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, 0, vma, to_kill);
+ add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+ to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -743,6 +745,9 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
};
priv.tk.tsk = p;
+ if (!p->mm)
+ return -EFAULT;
+
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
(void *)&priv);
@@ -1928,7 +1933,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* Call driver's implementation to handle the memory failure, otherwise
* fall back to generic handler.
*/
- if (pgmap->ops->memory_failure) {
+ if (pgmap_has_memory_failure(pgmap)) {
rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
/*
* Fall back to generic handler too if operation is not
diff --git a/mm/memory.c b/mm/memory.c
index 4ba73f5aa8bb..a78814413ac0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4386,14 +4386,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- ret = 0;
+
/* Re-check under ptl */
- if (likely(!vmf_pte_changed(vmf)))
+ if (likely(!vmf_pte_changed(vmf))) {
do_set_pte(vmf, page, vmf->address);
- else
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ ret = 0;
+ } else {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
ret = VM_FAULT_NOPAGE;
+ }
- update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d65476..dbf6c7a7a7c9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -193,10 +194,10 @@ again:
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ again:
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
@@ -254,13 +259,14 @@ next:
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 032a7bf8d259..7e9d8d857ecc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1933,6 +1933,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
wb_put(wb);
return ret;
}
+EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags);
/**
* balance_dirty_pages_ratelimited - balance dirty memory state.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e5486d47406e..08522a831c7a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4708,6 +4708,30 @@ void fs_reclaim_release(gfp_t gfp_mask)
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -5001,6 +5025,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
/*
@@ -5011,11 +5036,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
@@ -5187,9 +5213,13 @@ retry:
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -5209,9 +5239,13 @@ retry:
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -5706,6 +5740,18 @@ refill:
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request, this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
}
nc->pagecnt_bias--;
@@ -6039,7 +6085,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
" kernel_misc_reclaimable:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
@@ -6056,6 +6103,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
global_zone_page_state(NR_BOUNCE),
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
@@ -6089,6 +6137,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" shadow_call_stack:%lukB"
#endif
" pagetables:%lukB"
+ " sec_pagetables:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -6114,6 +6163,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -6514,9 +6564,8 @@ static void __build_all_zonelists(void *data)
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -6553,7 +6602,7 @@ static void __build_all_zonelists(void *data)
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
}
static noinline void __init
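
Pulled out of the hunks above, the locking change is a plain seqlock pairing: __build_all_zonelists() takes the write side instead of a file-local spinlock, and the allocation slowpath samples the sequence next to the cpuset cookie, restarting if a hotplug rebuild raced with it. A condensed sketch, where try_slowpath_alloc() stands in for the body of __alloc_pages_slowpath():

    restart:
        cpuset_mems_cookie   = read_mems_allowed_begin();
        zonelist_iter_cookie = zonelist_iter_begin();      /* read_seqbegin(), or 0 */

        page = try_slowpath_alloc(gfp_mask, order, ac);

        if (!page &&
            (check_retry_cpuset(cpuset_mems_cookie, ac) ||
             check_retry_zonelist(zonelist_iter_cookie)))  /* read_seqretry() */
                goto restart;

    /* writer side, memory hotplug: */
        write_seqlock(&zonelist_update_seq);
        /* rebuild node zonelists */
        write_sequnlock(&zonelist_update_seq);

When CONFIG_MEMORY_HOTREMOVE is disabled, both helpers reduce to constants and the retry never triggers.
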
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9d73dc38e3d7..eb3a68ca92ad 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -288,6 +288,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
+ * @migratetype: migrate type to set in error recovery.
*
* Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
@@ -302,9 +303,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
+ gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
+ int migratetype)
{
- unsigned char saved_mt;
unsigned long start_pfn;
unsigned long isolate_pageblock;
unsigned long pfn;
@@ -328,13 +329,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
zone->zone_start_pfn);
- saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+ if (skip_isolation) {
+ int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
- if (skip_isolation)
- VM_BUG_ON(!is_migrate_isolate(saved_mt));
- else {
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
- isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+ VM_BUG_ON(!is_migrate_isolate(mt));
+ } else {
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
+ flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
@@ -475,7 +476,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
failed:
/* restore the original migratetype */
if (!skip_isolation)
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
return -EBUSY;
}
@@ -537,7 +538,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+ skip_isolation, migratetype);
if (ret)
return ret;
@@ -545,7 +547,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+ skip_isolation, migratetype);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
return ret;
diff --git a/mm/readahead.c b/mm/readahead.c
index fdcd28cbd92d..b10f0cf81d80 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -122,6 +122,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
+#include <linux/psi.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
@@ -152,6 +153,8 @@ static void read_pages(struct readahead_control *rac)
if (!readahead_count(rac))
return;
+ if (unlikely(rac->_workingset))
+ psi_memstall_enter(&rac->_pflags);
blk_start_plug(&plug);
if (aops->readahead) {
@@ -179,6 +182,9 @@ static void read_pages(struct readahead_control *rac)
}
blk_finish_plug(&plug);
+ if (unlikely(rac->_workingset))
+ psi_memstall_leave(&rac->_pflags);
+ rac->_workingset = false;
BUG_ON(readahead_count(rac));
}
@@ -252,6 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
}
if (i == nr_to_read - lookahead_size)
folio_set_readahead(folio);
+ ractl->_workingset |= folio_test_workingset(folio);
ractl->_nr_pages++;
}
@@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
- if (err)
+ if (err) {
folio_put(folio);
- else
- ractl->_nr_pages += 1UL << order;
- return err;
+ return err;
+ }
+
+ ractl->_nr_pages += 1UL << order;
+ ractl->_workingset |= folio_test_workingset(folio);
+ return 0;
}
void page_cache_ra_order(struct readahead_control *ractl,
@@ -826,6 +836,10 @@ void readahead_expand(struct readahead_control *ractl,
put_page(page);
return;
}
+ if (unlikely(PageWorkingset(page)) && !ractl->_workingset) {
+ ractl->_workingset = true;
+ psi_memstall_enter(&ractl->_pflags);
+ }
ractl->_nr_pages++;
if (ra) {
ra->size++;
diff --git a/mm/secretmem.c b/mm/secretmem.c
index e3e9590c6fb3..3f7154099795 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -285,7 +285,7 @@ static int secretmem_init(void)
secretmem_mnt = kern_mount(&secretmem_fs);
if (IS_ERR(secretmem_mnt))
- ret = PTR_ERR(secretmem_mnt);
+ return PTR_ERR(secretmem_mnt);
/* prevent secretmem mappings from ever getting PROT_EXEC */
secretmem_mnt->mnt_flags |= MNT_NOEXEC;
diff --git a/mm/slab.c b/mm/slab.c
index 10e96137b44f..a5486ff8362a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3181,84 +3181,46 @@ must_grow:
}
static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
- unsigned long caller)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- unsigned long save_flags;
- void *ptr;
+ void *objp = NULL;
int slab_node = numa_mem_id();
- struct obj_cgroup *objcg = NULL;
- bool init = false;
- flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags);
- if (unlikely(!cachep))
- return NULL;
-
- ptr = kfence_alloc(cachep, orig_size, flags);
- if (unlikely(ptr))
- goto out_hooks;
-
- local_irq_save(save_flags);
-
- if (nodeid == NUMA_NO_NODE)
- nodeid = slab_node;
-
- if (unlikely(!get_node(cachep, nodeid))) {
- /* Node not bootstrapped yet */
- ptr = fallback_alloc(cachep, flags);
- goto out;
- }
-
- if (nodeid == slab_node) {
+ if (nodeid == NUMA_NO_NODE) {
+ if (current->mempolicy || cpuset_do_slab_mem_spread()) {
+ objp = alternate_node_alloc(cachep, flags);
+ if (objp)
+ goto out;
+ }
/*
* Use the locally cached objects if possible.
* However ____cache_alloc does not allow fallback
* to other nodes. It may fail while we still have
* objects on other nodes available.
*/
- ptr = ____cache_alloc(cachep, flags);
- if (ptr)
- goto out;
- }
- /* ___cache_alloc_node can fall back to other nodes */
- ptr = ____cache_alloc_node(cachep, flags, nodeid);
-out:
- local_irq_restore(save_flags);
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- init = slab_want_init_on_alloc(flags, cachep);
-
-out_hooks:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
- return ptr;
-}
-
-static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
-{
- void *objp;
-
- if (current->mempolicy || cpuset_do_slab_mem_spread()) {
- objp = alternate_node_alloc(cache, flags);
- if (objp)
- goto out;
+ objp = ____cache_alloc(cachep, flags);
+ nodeid = slab_node;
+ } else if (nodeid == slab_node) {
+ objp = ____cache_alloc(cachep, flags);
+ } else if (!get_node(cachep, nodeid)) {
+ /* Node not bootstrapped yet */
+ objp = fallback_alloc(cachep, flags);
+ goto out;
}
- objp = ____cache_alloc(cache, flags);
/*
* We may just have run out of memory on the local node.
* ____cache_alloc_node() knows how to locate memory on other nodes
*/
if (!objp)
- objp = ____cache_alloc_node(cache, flags, numa_mem_id());
-
+ objp = ____cache_alloc_node(cachep, flags, nodeid);
out:
return objp;
}
#else
static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused)
{
return ____cache_alloc(cachep, flags);
}
@@ -3266,8 +3228,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
- size_t orig_size, unsigned long caller)
+slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ int nodeid, size_t orig_size, unsigned long caller)
{
unsigned long save_flags;
void *objp;
@@ -3284,7 +3246,7 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
goto out;
local_irq_save(save_flags);
- objp = __do_cache_alloc(cachep, flags);
+ objp = __do_cache_alloc(cachep, flags, nodeid);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
@@ -3295,6 +3257,14 @@ out:
return objp;
}
+static __always_inline void *
+slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ size_t orig_size, unsigned long caller)
+{
+ return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size,
+ caller);
+}
+
/*
* Caller needs to acquire correct kmem_cache_node's list_lock
* @list: List of detached free slabs should be freed by caller
@@ -3470,8 +3440,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
{
void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc(_RET_IP_, ret, cachep,
- cachep->object_size, cachep->size, flags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE);
return ret;
}
@@ -3521,7 +3490,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
local_irq_disable();
for (i = 0; i < size; i++) {
- void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
+ void *objp = kfence_alloc(s, s->object_size, flags) ?:
+ __do_cache_alloc(s, flags, NUMA_NO_NODE);
if (unlikely(!objp))
goto error;
@@ -3548,23 +3518,6 @@ error:
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
-#ifdef CONFIG_TRACING
-void *
-kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
-{
- void *ret;
-
- ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(_RET_IP_, ret, cachep,
- size, cachep->size, flags);
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_trace);
-#endif
-
-#ifdef CONFIG_NUMA
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
* @cachep: The cache to allocate from.
@@ -3580,66 +3533,22 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
+ void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep,
- cachep->object_size, cachep->size,
- flags, nodeid);
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid,
- size_t size)
+void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid, size_t orig_size,
+ unsigned long caller)
{
- void *ret;
-
- ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc_node(_RET_IP_, ret, cachep,
- size, cachep->size,
- flags, nodeid);
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
-#endif
-
-static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
-{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
- ret = kasan_kmalloc(cachep, ret, size, flags);
-
- return ret;
+ return slab_alloc_node(cachep, NULL, flags, nodeid,
+ orig_size, caller);
}
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- return __do_kmalloc_node(size, flags, node, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc_node);
-
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
- int node, unsigned long caller)
-{
- return __do_kmalloc_node(size, flags, node, caller);
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_PRINTK
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
@@ -3662,45 +3571,25 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
}
#endif
-/**
- * __do_kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kmalloc).
- * @caller: function caller for debug tracking of the caller
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
- unsigned long caller)
+static __always_inline
+void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = slab_alloc(cachep, NULL, flags, size, caller);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(caller, ret, cachep,
- size, cachep->size, flags);
-
- return ret;
-}
+ unsigned long flags;
-void *__kmalloc(size_t size, gfp_t flags)
-{
- return __do_kmalloc(size, flags, _RET_IP_);
+ local_irq_save(flags);
+ debug_check_no_locks_freed(objp, cachep->object_size);
+ if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, cachep->object_size);
+ __cache_free(cachep, objp, caller);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL(__kmalloc);
-void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
+void __kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
- return __do_kmalloc(size, flags, caller);
+ __do_kmem_cache_free(cachep, objp, caller);
}
-EXPORT_SYMBOL(__kmalloc_track_caller);
/**
* kmem_cache_free - Deallocate an object
@@ -3712,34 +3601,38 @@ EXPORT_SYMBOL(__kmalloc_track_caller);
*/
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
- unsigned long flags;
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;
- trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
- local_irq_save(flags);
- debug_check_no_locks_freed(objp, cachep->object_size);
- if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(objp, cachep->object_size);
- __cache_free(cachep, objp, _RET_IP_);
- local_irq_restore(flags);
+ trace_kmem_cache_free(_RET_IP_, objp, cachep);
+ __do_kmem_cache_free(cachep, objp, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
{
- struct kmem_cache *s;
- size_t i;
local_irq_disable();
- for (i = 0; i < size; i++) {
+ for (int i = 0; i < size; i++) {
void *objp = p[i];
+ struct kmem_cache *s;
- if (!orig_s) /* called via kfree_bulk */
- s = virt_to_cache(objp);
- else
+ if (!orig_s) {
+ struct folio *folio = virt_to_folio(objp);
+
+ /* called via kfree_bulk */
+ if (!folio_test_slab(folio)) {
+ local_irq_enable();
+ free_large_kmalloc(folio, objp);
+ local_irq_disable();
+ continue;
+ }
+ s = folio_slab(folio)->slab_cache;
+ } else {
s = cache_from_obj(orig_s, objp);
+ }
+
if (!s)
continue;
@@ -3755,39 +3648,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-/**
- * kfree - free previously allocated memory
- * @objp: pointer returned by kmalloc.
- *
- * If @objp is NULL, no operation is performed.
- *
- * Don't free memory not originally allocated by kmalloc()
- * or you will run into trouble.
- */
-void kfree(const void *objp)
-{
- struct kmem_cache *c;
- unsigned long flags;
-
- trace_kfree(_RET_IP_, objp);
-
- if (unlikely(ZERO_OR_NULL_PTR(objp)))
- return;
- local_irq_save(flags);
- kfree_debugcheck(objp);
- c = virt_to_cache(objp);
- if (!c) {
- local_irq_restore(flags);
- return;
- }
- debug_check_no_locks_freed(objp, c->object_size);
-
- debug_check_no_obj_freed(objp, c->object_size);
- __cache_free(c, (void *)objp, _RET_IP_);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(kfree);
-
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
@@ -4190,28 +4050,3 @@ void __check_heap_object(const void *ptr, unsigned long n,
usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
-
-/**
- * __ksize -- Uninstrumented ksize.
- * @objp: pointer to the object
- *
- * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
- * safety checks as ksize() with KASAN instrumentation enabled.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
-size_t __ksize(const void *objp)
-{
- struct kmem_cache *c;
- size_t size;
-
- BUG_ON(!objp);
- if (unlikely(objp == ZERO_SIZE_PTR))
- return 0;
-
- c = virt_to_cache(objp);
- size = c ? c->object_size : 0;
-
- return size;
-}
-EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
index 4ec82bec15ec..65023f000d42 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -273,6 +273,11 @@ void create_kmalloc_caches(slab_flags_t);
/* Find the kmalloc slab corresponding for a certain size */
struct kmem_cache *kmalloc_slab(size_t, gfp_t);
+
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t orig_size,
+ unsigned long caller);
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller);
#endif
gfp_t kmalloc_fix_flags(gfp_t flags);
@@ -658,8 +663,13 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
print_tracking(cachep, x);
return cachep;
}
+
+void free_large_kmalloc(struct folio *folio, void *object);
+
#endif /* CONFIG_SLOB */
+size_t __ksize(const void *objp);
+
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifndef CONFIG_SLUB
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 07b948288f84..9ad97ae73a0a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -475,6 +475,7 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
int refcnt;
+ bool rcu_set;
if (unlikely(!s) || !kasan_check_byte(s))
return;
@@ -482,6 +483,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
cpus_read_lock();
mutex_lock(&slab_mutex);
+ rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
+
refcnt = --s->refcount;
if (refcnt)
goto out_unlock;
@@ -492,7 +495,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
out_unlock:
mutex_unlock(&slab_mutex);
cpus_read_unlock();
- if (!refcnt && !(s->flags & SLAB_TYPESAFE_BY_RCU))
+ if (!refcnt && !rcu_set)
kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -508,13 +511,9 @@ EXPORT_SYMBOL(kmem_cache_destroy);
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
- int ret;
-
-
kasan_cache_shrink(cachep);
- ret = __kmem_cache_shrink(cachep);
- return ret;
+ return __kmem_cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -662,7 +661,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
if (!s)
panic("Out of memory when creating slab %s\n", name);
- create_boot_cache(s, name, size, flags, useroffset, usersize);
+ create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
+ usersize);
kasan_cache_create_kmalloc(s);
list_add(&s->list, &slab_caches);
s->refcount = 1;
@@ -734,6 +734,26 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
return kmalloc_caches[kmalloc_type(flags)][index];
}
+size_t kmalloc_size_roundup(size_t size)
+{
+ struct kmem_cache *c;
+
+ /* Short-circuit the 0 size case. */
+ if (unlikely(size == 0))
+ return 0;
+ /* Short-circuit saturated "too-large" case. */
+ if (unlikely(size == SIZE_MAX))
+ return SIZE_MAX;
+ /* Above the smaller buckets, size is a multiple of page size. */
+ if (size > KMALLOC_MAX_CACHE_SIZE)
+ return PAGE_SIZE << get_order(size);
+
+ /* The flags don't matter since size_index is common to all. */
+ c = kmalloc_slab(size, GFP_KERNEL);
+ return c ? c->object_size : 0;
+}
+EXPORT_SYMBOL(kmalloc_size_roundup);
+
#ifdef CONFIG_ZONE_DMA
#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
#else
@@ -757,8 +777,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
- * kmalloc-32M.
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
+ * kmalloc-2M.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
@@ -782,11 +802,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(262144, 256k),
INIT_KMALLOC_INFO(524288, 512k),
INIT_KMALLOC_INFO(1048576, 1M),
- INIT_KMALLOC_INFO(2097152, 2M),
- INIT_KMALLOC_INFO(4194304, 4M),
- INIT_KMALLOC_INFO(8388608, 8M),
- INIT_KMALLOC_INFO(16777216, 16M),
- INIT_KMALLOC_INFO(33554432, 32M)
+ INIT_KMALLOC_INFO(2097152, 2M)
};
/*
@@ -899,6 +915,155 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/* Kmalloc array is now usable */
slab_state = UP;
}
+
+void free_large_kmalloc(struct folio *folio, void *object)
+{
+ unsigned int order = folio_order(folio);
+
+ if (WARN_ON_ONCE(order == 0))
+ pr_warn_once("object pointer: 0x%p\n", object);
+
+ kmemleak_free(object);
+ kasan_kfree_large(object);
+
+ mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(folio_page(folio, 0), order);
+}
+
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node);
+static __always_inline
+void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = __kmalloc_large_node(size, flags, node);
+ trace_kmalloc(_RET_IP_, ret, size,
+ PAGE_SIZE << get_order(size), flags, node);
+ return ret;
+ }
+
+ s = kmalloc_slab(size, flags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = __kmem_cache_alloc_node(s, flags, node, size, caller);
+ ret = kasan_kmalloc(s, ret, size, flags);
+ trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node);
+ return ret;
+}
+
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+
+/**
+ * kfree - free previously allocated memory
+ * @object: pointer returned by kmalloc.
+ *
+ * If @object is NULL, no operation is performed.
+ *
+ * Don't free memory not originally allocated by kmalloc()
+ * or you will run into trouble.
+ */
+void kfree(const void *object)
+{
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+
+ trace_kfree(_RET_IP_, object);
+
+ if (unlikely(ZERO_OR_NULL_PTR(object)))
+ return;
+
+ folio = virt_to_folio(object);
+ if (unlikely(!folio_test_slab(folio))) {
+ free_large_kmalloc(folio, (void *)object);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ __kmem_cache_free(s, (void *)object, _RET_IP_);
+}
+EXPORT_SYMBOL(kfree);
+
+/**
+ * __ksize -- Report full size of underlying allocation
+ * @objp: pointer to the object
+ *
+ * This should only be used internally to query the true size of allocations.
+ * It is not meant to be a way to discover the usable size of an allocation
+ * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
+ * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
+ * and/or FORTIFY_SOURCE.
+ *
+ * Return: size of the actual memory used by @objp in bytes
+ */
+size_t __ksize(const void *object)
+{
+ struct folio *folio;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ folio = virt_to_folio(object);
+
+ if (unlikely(!folio_test_slab(folio))) {
+ if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
+ return 0;
+ if (WARN_ON(object != folio_address(folio)))
+ return 0;
+ return folio_size(folio);
+ }
+
+ return slab_ksize(folio_slab(folio)->slab_cache);
+}
+
+#ifdef CONFIG_TRACING
+void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+ void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE,
+ size, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_trace);
+
+void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t size)
+{
+ void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_node_trace);
+#endif /* !CONFIG_TRACING */
#endif /* !CONFIG_SLOB */
gfp_t kmalloc_fix_flags(gfp_t flags)
@@ -918,37 +1083,50 @@ gfp_t kmalloc_fix_flags(gfp_t flags)
* directly to the page allocator. We use __GFP_COMP, because we will need to
* know the allocation order to free the pages properly in kfree.
*/
-void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- void *ret = NULL;
struct page *page;
+ void *ptr = NULL;
+ unsigned int order = get_order(size);
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- page = alloc_pages(flags, order);
- if (likely(page)) {
- ret = page_address(page);
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
+ ptr = page_address(page);
mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
}
- ret = kasan_kmalloc_large(ret, size, flags);
- /* As ret might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ret, size, 1, flags);
+
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ptr, size, 1, flags);
+
+ return ptr;
+}
+
+void *kmalloc_large(size_t size, gfp_t flags)
+{
+ void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE);
+
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+ flags, NUMA_NO_NODE);
return ret;
}
-EXPORT_SYMBOL(kmalloc_order);
+EXPORT_SYMBOL(kmalloc_large);
-#ifdef CONFIG_TRACING
-void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- void *ret = kmalloc_order(size, flags, order);
- trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags);
+ void *ret = __kmalloc_large_node(size, flags, node);
+
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+ flags, node);
return ret;
}
-EXPORT_SYMBOL(kmalloc_order_trace);
-#endif
+EXPORT_SYMBOL(kmalloc_large_node);
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
@@ -1147,8 +1325,8 @@ module_init(slab_proc_init);
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
-static __always_inline void *__do_krealloc(const void *p, size_t new_size,
- gfp_t flags)
+static __always_inline __realloc_size(2) void *
+__do_krealloc(const void *p, size_t new_size, gfp_t flags)
{
void *ret;
size_t ks;
@@ -1280,8 +1458,6 @@ EXPORT_SYMBOL(ksize);
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
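
kmalloc_size_roundup(), added above (with a SLOB counterpart below), is meant for callers that want to use the whole kmalloc bucket up front instead of querying ksize() after the allocation. A typical use, with the surrounding names invented for illustration:

    size_t want  = count * sizeof(struct sample_entry);    /* e.g. 100 bytes */
    size_t bytes = kmalloc_size_roundup(want);             /* -> 128 on SLAB/SLUB */
    struct sample_entry *buf;

    buf = kmalloc(bytes, GFP_KERNEL);
    if (!buf)
        return -ENOMEM;
    /* all 'bytes' bytes now belong to the caller, so the buffer can be
     * grown up to that size later without reallocating */

Per the implementation above, sizes beyond KMALLOC_MAX_CACHE_SIZE round up to the page-allocator order, 0 stays 0, and SIZE_MAX is passed through so saturated size calculations do not wrap.
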
diff --git a/mm/slob.c b/mm/slob.c
index 2bd4f476c340..fe567fcfa3a3 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -507,8 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
*m = size;
ret = (void *)m + minalign;
- trace_kmalloc_node(caller, ret, NULL,
- size, size + minalign, gfp, node);
+ trace_kmalloc(caller, ret, size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -516,8 +515,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
gfp |= __GFP_COMP;
ret = slob_new_pages(gfp, order, node);
- trace_kmalloc_node(caller, ret, NULL,
- size, PAGE_SIZE << order, gfp, node);
+ trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node);
}
kmemleak_alloc(ret, size, 1, gfp);
@@ -530,20 +528,12 @@ void *__kmalloc(size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(__kmalloc);
-void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
-{
- return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
int node, unsigned long caller)
{
return __do_kmalloc_node(size, gfp, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif
void kfree(const void *block)
{
@@ -574,6 +564,20 @@ void kfree(const void *block)
}
EXPORT_SYMBOL(kfree);
+size_t kmalloc_size_roundup(size_t size)
+{
+ /* Short-circuit the 0 size case. */
+ if (unlikely(size == 0))
+ return 0;
+ /* Short-circuit saturated "too-large" case. */
+ if (unlikely(size == SIZE_MAX))
+ return SIZE_MAX;
+
+ return ALIGN(size, ARCH_KMALLOC_MINALIGN);
+}
+
+EXPORT_SYMBOL(kmalloc_size_roundup);
+
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t __ksize(const void *block)
{
@@ -594,7 +598,6 @@ size_t __ksize(const void *block)
m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
}
-EXPORT_SYMBOL(__ksize);
int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{
@@ -602,6 +605,9 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
/* leave room for rcu footer at the end of object */
c->size += sizeof(struct slob_rcu);
}
+
+ /* Actual size allocated */
+ c->size = SLOB_UNITS(c->size) * SLOB_UNIT;
c->flags = flags;
return 0;
}
@@ -616,14 +622,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node, 0);
- trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
- SLOB_UNITS(c->size) * SLOB_UNIT,
- flags, node);
+ trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
- PAGE_SIZE << get_order(c->size),
- flags, node);
+ trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node);
}
if (b && c->ctor) {
@@ -647,7 +649,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_
return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);
-#ifdef CONFIG_NUMA
+
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
return __do_kmalloc_node(size, gfp, node, _RET_IP_);
@@ -659,7 +661,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
return slob_alloc_node(cachep, gfp, node);
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif
static void __kmem_cache_free(void *b, int size)
{
@@ -680,7 +681,7 @@ static void kmem_rcu_free(struct rcu_head *head)
void kmem_cache_free(struct kmem_cache *c, void *b)
{
kmemleak_free_recursive(b, c->flags);
- trace_kmem_cache_free(_RET_IP_, b, c->name);
+ trace_kmem_cache_free(_RET_IP_, b, c);
if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
struct slob_rcu *slob_rcu;
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
diff --git a/mm/slub.c b/mm/slub.c
index 862dbd9af4f5..2a6b3f31ce7e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -50,7 +50,7 @@
* 1. slab_mutex (Global Mutex)
* 2. node->list_lock (Spinlock)
* 3. kmem_cache->cpu_slab->lock (Local lock)
- * 4. slab_lock(slab) (Only on some arches or for debugging)
+ * 4. slab_lock(slab) (Only on some arches)
* 5. object_map_lock (Only for debugging)
*
* slab_mutex
@@ -64,8 +64,9 @@
* The slab_lock is a wrapper around the page lock, thus it is a bit
* spinlock.
*
- * The slab_lock is only used for debugging and on arches that do not
- * have the ability to do a cmpxchg_double. It only protects:
+ * The slab_lock is only used on arches that do not have the ability
+ * to do a cmpxchg_double. It only protects:
+ *
* A. slab->freelist -> List of free objects in a slab
* B. slab->inuse -> Number of objects in use
* C. slab->objects -> Number of objects in slab
@@ -94,15 +95,20 @@
* allocating a long series of objects that fill up slabs does not require
* the list lock.
*
+ * For debug caches, all allocations are forced to go through a list_lock
+ * protected region to serialize against concurrent validation.
+ *
* cpu_slab->lock local lock
*
* This locks protect slowpath manipulation of all kmem_cache_cpu fields
* except the stat counters. This is a percpu structure manipulated only by
* the local cpu, so the lock protects against being preempted or interrupted
* by an irq. Fast path operations rely on lockless operations instead.
- * On PREEMPT_RT, the local lock does not actually disable irqs (and thus
- * prevent the lockless operations), so fastpath operations also need to take
- * the lock and are no longer lockless.
+ *
+ * On PREEMPT_RT, the local lock neither disables interrupts nor preemption,
+ * which means the lockless fastpath cannot be used as it might interfere with
+ * an in-progress slow path operation. In this case the local lock is always
+ * taken, but it still utilizes the freelist for the common operations.
*
* lockless fastpaths
*
@@ -163,8 +169,9 @@
* function call even on !PREEMPT_RT, use inline preempt_disable() there.
*/
#ifndef CONFIG_PREEMPT_RT
-#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
-#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#define USE_LOCKLESS_FAST_PATH() (true)
#else
#define slub_get_cpu_ptr(var) \
({ \
@@ -176,6 +183,7 @@ do { \
(void)(var); \
migrate_enable(); \
} while (0)
+#define USE_LOCKLESS_FAST_PATH() (false)
#endif
#ifdef CONFIG_SLUB_DEBUG
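
USE_LOCKLESS_FAST_PATH() replaces the IS_ENABLED(CONFIG_PREEMPT_RT) checks that were previously scattered through the allocation and free paths with a single compile-time predicate. A rough userspace analogy of the pattern (not SLUB code; SIM_PREEMPT_RT, the counters and the mutex are all invented for illustration):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Build-time switch playing the role of CONFIG_PREEMPT_RT. */
#ifndef SIM_PREEMPT_RT
#define USE_LOCKLESS_FAST_PATH()	(true)
#else
#define USE_LOCKLESS_FAST_PATH()	(false)
#endif

static _Atomic long lockless_counter;
static long locked_counter;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void add(long delta)
{
	if (USE_LOCKLESS_FAST_PATH()) {
		/* Fast path: a single atomic read-modify-write, no lock. */
		atomic_fetch_add_explicit(&lockless_counter, delta,
					  memory_order_relaxed);
	} else {
		/* "RT" path: always take the lock, same end result. */
		pthread_mutex_lock(&lock);
		locked_counter += delta;
		pthread_mutex_unlock(&lock);
	}
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		add(1);
	printf("%ld\n", USE_LOCKLESS_FAST_PATH() ?
	       atomic_load(&lockless_counter) : locked_counter);
	return 0;
}
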
@@ -186,11 +194,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif /* CONFIG_SLUB_DEBUG */
+/* Structure holding parameters for get_partial() call chain */
+struct partial_context {
+ struct slab **slab;
+ gfp_t flags;
+ unsigned int orig_size;
+};
+
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+ return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+ (s->flags & SLAB_KMALLOC));
+}
+
void *fixup_red_left(struct kmem_cache *s, void *p)
{
if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
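
struct partial_context gathers the values that used to be passed individually down the get_partial() -> get_partial_node() / get_any_partial() chain. A tiny sketch of the same "parameter object" refactor in plain C (the lookup_ctx names and the fake node search are illustrative, not SLUB code):

#include <stdio.h>

/* All inputs and the output slot of a lookup chain in one struct. */
struct lookup_ctx {
	unsigned int flags;		/* plays the role of gfp flags */
	unsigned int orig_size;		/* original request size */
	int *out_node;			/* where the result is returned */
};

static int search_node(int node, struct lookup_ctx *ctx)
{
	/* Pretend only node 2 can satisfy the request. */
	if (node == 2) {
		*ctx->out_node = node;
		return 1;
	}
	return 0;
}

static int search_all(struct lookup_ctx *ctx)
{
	for (int node = 0; node < 4; node++)
		if (search_node(node, ctx))	/* one pointer instead of N arguments */
			return 1;
	return 0;
}

int main(void)
{
	int node = -1;
	struct lookup_ctx ctx = { .flags = 0, .orig_size = 32, .out_node = &node };

	if (search_all(&ctx))
		printf("found on node %d (orig_size=%u)\n", node, ctx.orig_size);
	return 0;
}
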
@@ -310,6 +331,11 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
*/
static nodemask_t slab_nodes;
+/*
+ * Workqueue used for flush_cpu_slab().
+ */
+static struct workqueue_struct *flushwq;
+
/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -442,7 +468,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
/*
* Per slab locking using the pagelock
*/
-static __always_inline void __slab_lock(struct slab *slab)
+static __always_inline void slab_lock(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -450,7 +476,7 @@ static __always_inline void __slab_lock(struct slab *slab)
bit_spin_lock(PG_locked, &page->flags);
}
-static __always_inline void __slab_unlock(struct slab *slab)
+static __always_inline void slab_unlock(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -458,31 +484,19 @@ static __always_inline void __slab_unlock(struct slab *slab)
__bit_spin_unlock(PG_locked, &page->flags);
}
-static __always_inline void slab_lock(struct slab *slab, unsigned long *flags)
-{
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_save(*flags);
- __slab_lock(slab);
-}
-
-static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags)
-{
- __slab_unlock(slab);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_restore(*flags);
-}
-
/*
* Interrupts must be disabled (for the fallback code to work right), typically
- * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
- * so we disable interrupts as part of slab_[un]lock().
+ * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
+ * part of bit_spin_lock(), is sufficient because the policy is not to allow any
+ * allocation/free operation in hardirq context. Therefore nothing can
+ * interrupt the operation.
*/
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ if (USE_LOCKLESS_FAST_PATH())
lockdep_assert_irqs_disabled();
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
@@ -494,18 +508,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab
} else
#endif
{
- /* init to 0 to prevent spurious warnings */
- unsigned long flags = 0;
-
- slab_lock(slab, &flags);
+ slab_lock(slab);
if (slab->freelist == freelist_old &&
slab->counters == counters_old) {
slab->freelist = freelist_new;
slab->counters = counters_new;
- slab_unlock(slab, &flags);
+ slab_unlock(slab);
return true;
}
- slab_unlock(slab, &flags);
+ slab_unlock(slab);
}
cpu_relax();
@@ -536,16 +547,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
unsigned long flags;
local_irq_save(flags);
- __slab_lock(slab);
+ slab_lock(slab);
if (slab->freelist == freelist_old &&
slab->counters == counters_old) {
slab->freelist = freelist_new;
slab->counters = counters_new;
- __slab_unlock(slab);
+ slab_unlock(slab);
local_irq_restore(flags);
return true;
}
- __slab_unlock(slab);
+ slab_unlock(slab);
local_irq_restore(flags);
}
@@ -561,7 +572,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
-static DEFINE_RAW_SPINLOCK(object_map_lock);
+static DEFINE_SPINLOCK(object_map_lock);
static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
struct slab *slab)
@@ -595,30 +606,6 @@ static bool slab_add_kunit_errors(void)
static inline bool slab_add_kunit_errors(void) { return false; }
#endif
-/*
- * Determine a map of objects in use in a slab.
- *
- * Node listlock must be held to guarantee that the slab does
- * not vanish from under us.
- */
-static unsigned long *get_map(struct kmem_cache *s, struct slab *slab)
- __acquires(&object_map_lock)
-{
- VM_BUG_ON(!irqs_disabled());
-
- raw_spin_lock(&object_map_lock);
-
- __fill_map(object_map, s, slab);
-
- return object_map;
-}
-
-static void put_map(unsigned long *map) __releases(&object_map_lock)
-{
- VM_BUG_ON(map != object_map);
- raw_spin_unlock(&object_map_lock);
-}
-
static inline unsigned int size_from_object(struct kmem_cache *s)
{
if (s->flags & SLAB_RED_ZONE)
@@ -816,6 +803,39 @@ static void print_slab_info(const struct slab *slab)
folio_flags(folio, 0));
}
+/*
+ * kmalloc caches have fixed sizes (mostly powers of 2), and the kmalloc()
+ * API family rounds the real request size up to these fixed sizes, so
+ * there can be extra space beyond what was requested. Save the original
+ * request size in the metadata area for better debugging and sanity checks.
+ */
+static inline void set_orig_size(struct kmem_cache *s,
+ void *object, unsigned int orig_size)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return;
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ *(unsigned int *)p = orig_size;
+}
+
+static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return s->object_size;
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ return *(unsigned int *)p;
+}
+
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
struct va_format vaf;
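
The accessors above stash the caller's original request size right after the two struct track records in the debug metadata. A simplified userspace model of that offset arithmetic (the toy struct track and the "info end is right after the object" assumption are illustrative only; the kernel derives the start via get_info_end() and kasan_reset_tag()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-in for the kernel's struct track (call-site record). */
struct track { unsigned long addr; int cpu; int pid; };

/* Object layout modelled as: [object bytes][2 x struct track][orig_size]. */
static size_t orig_size_offset(size_t object_size)
{
	return object_size + 2 * sizeof(struct track);
}

static void set_orig_size(void *object, size_t object_size, unsigned int orig)
{
	memcpy((char *)object + orig_size_offset(object_size), &orig, sizeof(orig));
}

static unsigned int get_orig_size(const void *object, size_t object_size)
{
	unsigned int orig;

	memcpy(&orig, (const char *)object + orig_size_offset(object_size),
	       sizeof(orig));
	return orig;
}

int main(void)
{
	size_t object_size = 64;	/* e.g. a kmalloc-64 object */
	void *obj = calloc(1, orig_size_offset(object_size) + sizeof(unsigned int));

	set_orig_size(obj, object_size, 40);	/* caller asked for 40 bytes */
	printf("waste = %zu bytes\n",
	       object_size - get_orig_size(obj, object_size));
	free(obj);
	return 0;
}
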
@@ -875,6 +895,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
+ if (slub_debug_orig_size(s))
+ off += sizeof(unsigned int);
+
off += kasan_metadata_size(s);
if (off != size_from_object(s))
@@ -1008,7 +1031,8 @@ skip_bug_print:
*
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
- * C. Padding to reach required alignment boundary or at minimum
+ * C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
+ * D. Padding to reach required alignment boundary or at minimum
* one word if debugging is on to be able to detect writes
* before the word boundary.
*
@@ -1026,10 +1050,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
unsigned long off = get_info_end(s); /* The end of info */
- if (s->flags & SLAB_STORE_USER)
+ if (s->flags & SLAB_STORE_USER) {
/* We also have user information there */
off += 2 * sizeof(struct track);
+ if (s->flags & SLAB_KMALLOC)
+ off += sizeof(unsigned int);
+ }
+
off += kasan_metadata_size(s);
if (size_from_object(s) == off)
@@ -1324,18 +1352,16 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
}
static noinline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab,
- void *object, unsigned long addr)
+ struct slab *slab, void *object, int orig_size)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
if (!alloc_consistency_checks(s, slab, object))
goto bad;
}
- /* Success perform special debug activities for allocs */
- if (s->flags & SLAB_STORE_USER)
- set_track(s, object, TRACK_ALLOC, addr);
+ /* Success. Perform special debug activities for allocs */
trace(s, slab, object, 1);
+ set_orig_size(s, object, orig_size);
init_object(s, object, SLUB_RED_ACTIVE);
return 1;
@@ -1385,63 +1411,6 @@ static inline int free_consistency_checks(struct kmem_cache *s,
return 1;
}
-/* Supports checking bulk free of a constructed freelist */
-static noinline int free_debug_processing(
- struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr)
-{
- struct kmem_cache_node *n = get_node(s, slab_nid(slab));
- void *object = head;
- int cnt = 0;
- unsigned long flags, flags2;
- int ret = 0;
- depot_stack_handle_t handle = 0;
-
- if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
-
- spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(slab, &flags2);
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!check_slab(s, slab))
- goto out;
- }
-
-next_object:
- cnt++;
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!free_consistency_checks(s, slab, object, addr))
- goto out;
- }
-
- if (s->flags & SLAB_STORE_USER)
- set_track_update(s, object, TRACK_FREE, addr, handle);
- trace(s, slab, object, 0);
- /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
- init_object(s, object, SLUB_RED_INACTIVE);
-
- /* Reached end of constructed freelist yet? */
- if (object != tail) {
- object = get_freepointer(s, object);
- goto next_object;
- }
- ret = 1;
-
-out:
- if (cnt != bulk_cnt)
- slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
- bulk_cnt, cnt);
-
- slab_unlock(slab, &flags2);
- spin_unlock_irqrestore(&n->list_lock, flags);
- if (!ret)
- slab_fix(s, "Object at 0x%p not freed", object);
- return ret;
-}
-
/*
* Parse a block of slub_debug options. Blocks are delimited by ';'
*
@@ -1661,16 +1630,18 @@ static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab, void *object, unsigned long addr) { return 0; }
+ struct slab *slab, void *object, int orig_size) { return 0; }
-static inline int free_debug_processing(
+static inline void free_debug_processing(
struct kmem_cache *s, struct slab *slab,
void *head, void *tail, int bulk_cnt,
- unsigned long addr) { return 0; }
+ unsigned long addr) {}
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
+static inline void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -1704,20 +1675,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- ptr = kasan_kmalloc_large(ptr, size, flags);
- /* As ptr might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ptr, size, 1, flags);
- return ptr;
-}
-
-static __always_inline void kfree_hook(void *x)
-{
- kmemleak_free(x);
- kasan_kfree_large(x);
-}
-
static __always_inline bool slab_free_hook(struct kmem_cache *s,
void *x, bool init)
{
@@ -1976,11 +1933,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab))
- goto out;
+ return NULL;
stat(s, ORDER_FALLBACK);
}
slab->objects = oo_objects(oo);
+ slab->inuse = 0;
+ slab->frozen = 0;
account_slab(slab, oo_order(oo), s, flags);
@@ -2007,15 +1966,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, p, NULL);
}
- slab->inuse = slab->objects;
- slab->frozen = 1;
-
-out:
- if (!slab)
- return NULL;
-
- inc_slabs_node(s, slab_nid(slab), slab->objects);
-
return slab;
}
@@ -2103,6 +2053,75 @@ static inline void remove_partial(struct kmem_cache_node *n,
}
/*
+ * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
+ * slab from the n->partial list. Remove only a single object from the slab, do
+ * the alloc_debug_processing() checks and leave the slab on the list, or move
+ * it to the full list if it was the last free object.
+ */
+static void *alloc_single_from_partial(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct slab *slab, int orig_size)
+{
+ void *object;
+
+ lockdep_assert_held(&n->list_lock);
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse++;
+
+ if (!alloc_debug_processing(s, slab, object, orig_size)) {
+ remove_partial(n, slab);
+ return NULL;
+ }
+
+ if (slab->inuse == slab->objects) {
+ remove_partial(n, slab);
+ add_full(s, n, slab);
+ }
+
+ return object;
+}
+
+/*
+ * Called only for kmem_cache_debug() caches to allocate from a freshly
+ * allocated slab. Allocate a single object instead of the whole freelist
+ * and put the slab on the partial (or full) list.
+ */
+static void *alloc_single_from_new_slab(struct kmem_cache *s,
+ struct slab *slab, int orig_size)
+{
+ int nid = slab_nid(slab);
+ struct kmem_cache_node *n = get_node(s, nid);
+ unsigned long flags;
+ void *object;
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse = 1;
+
+ if (!alloc_debug_processing(s, slab, object, orig_size))
+ /*
+ * It's not really expected that this would fail on a
+		 * freshly allocated slab, but concurrent memory
+		 * corruption could in theory cause it.
+ */
+ return NULL;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (slab->inuse == slab->objects)
+ add_full(s, n, slab);
+ else
+ add_partial(n, slab, DEACTIVATE_TO_HEAD);
+
+ inc_slabs_node(s, nid, slab->objects);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ return object;
+}
+
+/*
* Remove slab from the partial list, freeze it and
* return the pointer to the freelist.
*
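
For debug caches the two helpers above hand out exactly one object per call under n->list_lock and move the slab to the full list once its last free object is gone, instead of freezing the whole freelist. A condensed userspace model of that bookkeeping (no locking, list membership reduced to an enum; none of this is the real SLUB data structure):

#include <stdio.h>

enum list_id { ON_PARTIAL, ON_FULL };

struct toy_slab {
	void **freelist;	/* NULL-terminated chain of free objects */
	unsigned int inuse;
	unsigned int objects;
	enum list_id list;
};

/* Take a single object; move the slab to the "full" list when exhausted. */
static void *alloc_single(struct toy_slab *slab)
{
	void *object = slab->freelist;

	if (!object)
		return NULL;
	slab->freelist = *(void **)object;	/* next pointer lives in the object */
	slab->inuse++;

	if (slab->inuse == slab->objects)
		slab->list = ON_FULL;
	return object;
}

int main(void)
{
	/* Three free objects, each storing the address of the next one. */
	void *storage[3] = { &storage[1], &storage[2], NULL };
	struct toy_slab slab = {
		.freelist = storage, .inuse = 0, .objects = 3, .list = ON_PARTIAL,
	};

	while (alloc_single(&slab))
		;
	printf("inuse=%u, on %s list\n", slab.inuse,
	       slab.list == ON_FULL ? "full" : "partial");
	return 0;
}
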
@@ -2159,7 +2178,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
* Try to allocate a partial slab from a specific node.
*/
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
- struct slab **ret_slab, gfp_t gfpflags)
+ struct partial_context *pc)
{
struct slab *slab, *slab2;
void *object = NULL;
@@ -2179,15 +2198,23 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
void *t;
- if (!pfmemalloc_match(slab, gfpflags))
+ if (!pfmemalloc_match(slab, pc->flags))
+ continue;
+
+ if (kmem_cache_debug(s)) {
+ object = alloc_single_from_partial(s, n, slab,
+ pc->orig_size);
+ if (object)
+ break;
continue;
+ }
t = acquire_slab(s, n, slab, object == NULL);
if (!t)
break;
if (!object) {
- *ret_slab = slab;
+ *pc->slab = slab;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
@@ -2211,14 +2238,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Get a slab from somewhere. Search in increasing NUMA distances.
*/
-static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
- struct slab **ret_slab)
+static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type highest_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(pc->flags);
void *object;
unsigned int cpuset_mems_cookie;
@@ -2246,15 +2272,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(mempolicy_slab_node(), flags);
+ zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
struct kmem_cache_node *n;
n = get_node(s, zone_to_nid(zone));
- if (n && cpuset_zone_allowed(zone, flags) &&
+ if (n && cpuset_zone_allowed(zone, pc->flags) &&
n->nr_partial > s->min_partial) {
- object = get_partial_node(s, n, ret_slab, flags);
+ object = get_partial_node(s, n, pc);
if (object) {
/*
* Don't check read_mems_allowed_retry()
@@ -2275,8 +2301,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
/*
* Get a partial slab, lock it and return it.
*/
-static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
- struct slab **ret_slab)
+static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
void *object;
int searchnode = node;
@@ -2284,11 +2309,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
+ object = get_partial_node(s, get_node(s, searchnode), pc);
if (object || node != NUMA_NO_NODE)
return object;
- return get_any_partial(s, flags, ret_slab);
+ return get_any_partial(s, pc);
}
#ifdef CONFIG_PREEMPTION
@@ -2730,7 +2755,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s)
INIT_WORK(&sfw->work, flush_cpu_slab);
sfw->skip = false;
sfw->s = s;
- schedule_work_on(cpu, &sfw->work);
+ queue_work_on(cpu, flushwq, &sfw->work);
}
for_each_online_cpu(cpu) {
@@ -2788,6 +2813,113 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
return atomic_long_read(&n->total_objects);
}
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline void free_debug_processing(
+ struct kmem_cache *s, struct slab *slab,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+ struct slab *slab_free = NULL;
+ void *object = head;
+ int cnt = 0;
+ unsigned long flags;
+ bool checks_ok = false;
+ depot_stack_handle_t handle = 0;
+
+ if (s->flags & SLAB_STORE_USER)
+ handle = set_track_prepare();
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, slab))
+ goto out;
+ }
+
+ if (slab->inuse < bulk_cnt) {
+ slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
+ slab->inuse, bulk_cnt);
+ goto out;
+ }
+
+next_object:
+
+ if (++cnt > bulk_cnt)
+ goto out_cnt;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, slab, object, addr))
+ goto out;
+ }
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track_update(s, object, TRACK_FREE, addr, handle);
+ trace(s, slab, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
+ init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
+ checks_ok = true;
+
+out_cnt:
+ if (cnt != bulk_cnt)
+ slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
+ bulk_cnt, cnt);
+
+out:
+ if (checks_ok) {
+ void *prior = slab->freelist;
+
+ /* Perform the actual freeing while we still hold the locks */
+ slab->inuse -= cnt;
+ set_freepointer(s, tail, prior);
+ slab->freelist = head;
+
+ /*
+	 * If the slab is empty and the node's partial list is full,
+	 * it should be discarded regardless of whether it is on the
+	 * full or the partial list.
+ */
+ if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
+ slab_free = slab;
+
+ if (!prior) {
+ /* was on full list */
+ remove_full(s, n, slab);
+ if (!slab_free) {
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ } else if (slab_free) {
+ remove_partial(n, slab);
+ stat(s, FREE_REMOVE_PARTIAL);
+ }
+ }
+
+ if (slab_free) {
+ /*
+ * Update the counters while still holding n->list_lock to
+ * prevent spurious validation warnings
+ */
+ dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ if (!checks_ok)
+ slab_fix(s, "Object at 0x%p not freed", object);
+
+ if (slab_free) {
+ stat(s, FREE_SLAB);
+ free_slab(s, slab_free);
+ }
+}
#endif /* CONFIG_SLUB_DEBUG */
#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
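
The rewritten free_debug_processing() first walks the constructed head..tail freelist, counting and checking every object, and only then splices the whole chain back onto slab->freelist while n->list_lock is still held. A stripped-down sketch of the walk-and-count step, with the per-object checks reduced to the length comparison (plain linked list, not the kernel freepointer encoding):

#include <stdbool.h>
#include <stdio.h>

struct obj { struct obj *next; };

/* Verify that a head..tail chain really contains @bulk_cnt objects. */
static bool check_bulk_chain(struct obj *head, struct obj *tail, int bulk_cnt)
{
	struct obj *object = head;
	int cnt = 0;

	for (;;) {
		if (++cnt > bulk_cnt)
			break;			/* chain longer than advertised */
		/* per-object consistency checks would run here */
		if (object == tail)
			break;			/* reached the declared end */
		object = object->next;
	}

	if (cnt != bulk_cnt) {
		fprintf(stderr, "Bulk free expected %d objects but found %d\n",
			bulk_cnt, cnt);
		return false;
	}
	return true;
}

int main(void)
{
	struct obj o3 = { .next = NULL };
	struct obj o2 = { .next = &o3 };
	struct obj o1 = { .next = &o2 };

	printf("ok=%d\n", check_bulk_chain(&o1, &o3, 3));	/* matches */
	printf("ok=%d\n", check_bulk_chain(&o1, &o2, 3));	/* too short */
	return 0;
}
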
@@ -2905,11 +3037,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *freelist;
struct slab *slab;
unsigned long flags;
+ struct partial_context pc;
stat(s, ALLOC_SLOWPATH);
@@ -3023,7 +3156,10 @@ new_slab:
new_objects:
- freelist = get_partial(s, gfpflags, node, &slab);
+ pc.flags = gfpflags;
+ pc.slab = &slab;
+ pc.orig_size = orig_size;
+ freelist = get_partial(s, node, &pc);
if (freelist)
goto check_new_slab;
@@ -3036,36 +3172,53 @@ new_objects:
return NULL;
}
+ stat(s, ALLOC_SLAB);
+
+ if (kmem_cache_debug(s)) {
+ freelist = alloc_single_from_new_slab(s, slab, orig_size);
+
+ if (unlikely(!freelist))
+ goto new_objects;
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+
+ return freelist;
+ }
+
/*
* No other reference to the slab yet so we can
* muck around with it freely without cmpxchg
*/
freelist = slab->freelist;
slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab->frozen = 1;
- stat(s, ALLOC_SLAB);
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
check_new_slab:
if (kmem_cache_debug(s)) {
- if (!alloc_debug_processing(s, slab, freelist, addr)) {
- /* Slab failed checks. Next slab needed */
- goto new_slab;
- } else {
- /*
- * For debug case, we don't load freelist so that all
- * allocations go through alloc_debug_processing()
- */
- goto return_single;
- }
+ /*
+		 * For debug caches we already went through
+		 * alloc_single_from_partial(), so just store the tracking
+		 * info and return the object.
+ */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+
+ return freelist;
}
- if (unlikely(!pfmemalloc_match(slab, gfpflags)))
+ if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
/*
* For !pfmemalloc_match() case we don't load freelist so that
* we don't make further mismatched allocations easier.
*/
- goto return_single;
+ deactivate_slab(s, slab, get_freepointer(s, freelist));
+ return freelist;
+ }
retry_load_slab:
@@ -3089,11 +3242,6 @@ retry_load_slab:
c->slab = slab;
goto load_freelist;
-
-return_single:
-
- deactivate_slab(s, slab, get_freepointer(s, freelist));
- return freelist;
}
/*
@@ -3102,7 +3250,7 @@ return_single:
* pointer.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *p;
@@ -3115,7 +3263,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = slub_get_cpu_ptr(s->cpu_slab);
#endif
- p = ___slab_alloc(s, gfpflags, node, addr, c);
+ p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
slub_put_cpu_ptr(s->cpu_slab);
#endif
@@ -3197,16 +3345,10 @@ redo:
object = c->freelist;
slab = c->slab;
- /*
- * We cannot use the lockless fastpath on PREEMPT_RT because if a
- * slowpath has taken the local_lock_irqsave(), it is not protected
- * against a fast path operation in an irq handler. So we need to take
- * the slow path which uses local_lock. It is still relatively fast if
- * there is a suitable cpu freelist.
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+
+ if (!USE_LOCKLESS_FAST_PATH() ||
unlikely(!object || !slab || !node_match(slab, node))) {
- object = __slab_alloc(s, gfpflags, node, addr, c);
+ object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3257,8 +3399,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
{
void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size,
- s->size, gfpflags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
return ret;
}
@@ -3276,46 +3417,24 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t orig_size,
+ unsigned long caller)
{
- void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
- trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags);
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
+ return slab_alloc_node(s, NULL, gfpflags, node,
+ caller, orig_size);
}
-EXPORT_SYMBOL(kmem_cache_alloc_trace);
-#endif
-#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc_node(_RET_IP_, ret, s,
- s->object_size, s->size, gfpflags, node);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
- gfp_t gfpflags,
- int node, size_t size)
-{
- void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
-
- trace_kmalloc_node(_RET_IP_, ret, s,
- size, s->size, gfpflags, node);
-
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
-#endif
-#endif /* CONFIG_NUMA */
-
/*
* Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
@@ -3341,9 +3460,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
if (kfence_free(head))
return;
- if (kmem_cache_debug(s) &&
- !free_debug_processing(s, slab, head, tail, cnt, addr))
+ if (kmem_cache_debug(s)) {
+ free_debug_processing(s, slab, head, tail, cnt, addr);
return;
+ }
do {
if (unlikely(n)) {
@@ -3463,6 +3583,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
+ void **freelist;
redo:
/*
@@ -3477,9 +3598,13 @@ redo:
/* Same with comment on barrier() in slab_alloc_node() */
barrier();
- if (likely(slab == c->slab)) {
-#ifndef CONFIG_PREEMPT_RT
- void **freelist = READ_ONCE(c->freelist);
+ if (unlikely(slab != c->slab)) {
+ __slab_free(s, slab, head, tail_obj, cnt, addr);
+ return;
+ }
+
+ if (USE_LOCKLESS_FAST_PATH()) {
+ freelist = READ_ONCE(c->freelist);
set_freepointer(s, tail_obj, freelist);
@@ -3491,16 +3616,8 @@ redo:
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
-#else /* CONFIG_PREEMPT_RT */
- /*
- * We cannot use the lockless fastpath on PREEMPT_RT because if
- * a slowpath has taken the local_lock_irqsave(), it is not
- * protected against a fast path operation in an irq handler. So
- * we need to take the local_lock. We shouldn't simply defer to
- * __slab_free() as that wouldn't use the cpu freelist at all.
- */
- void **freelist;
-
+ } else {
+ /* Update the free list under the local lock */
local_lock(&s->cpu_slab->lock);
c = this_cpu_ptr(s->cpu_slab);
if (unlikely(slab != c->slab)) {
@@ -3515,11 +3632,8 @@ redo:
c->tid = next_tid(tid);
local_unlock(&s->cpu_slab->lock);
-#endif
- stat(s, FREE_FASTPATH);
- } else
- __slab_free(s, slab, head, tail_obj, cnt, addr);
-
+ }
+ stat(s, FREE_FASTPATH);
}
static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
@@ -3542,12 +3656,17 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
}
#endif
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller)
+{
+ slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller);
+}
+
void kmem_cache_free(struct kmem_cache *s, void *x)
{
s = cache_from_obj(s, x);
if (!s)
return;
- trace_kmem_cache_free(_RET_IP_, x, s->name);
+ trace_kmem_cache_free(_RET_IP_, x, s);
slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3560,19 +3679,6 @@ struct detached_freelist {
struct kmem_cache *s;
};
-static inline void free_large_kmalloc(struct folio *folio, void *object)
-{
- unsigned int order = folio_order(folio);
-
- if (WARN_ON_ONCE(order == 0))
- pr_warn_once("object pointer: 0x%p\n", object);
-
- kfree_hook(object);
- mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(folio_page(folio, 0), order);
-}
-
/*
* This function progressively scans the array with free objects (with
 * a limited look ahead) and extracts objects belonging to the same
@@ -3709,7 +3815,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
* of re-populating per CPU c->freelist
*/
p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c);
+ _RET_IP_, c, s->object_size);
if (unlikely(!p[i]))
goto error;
@@ -3936,6 +4042,7 @@ static void early_kmem_cache_node_alloc(int node)
slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
BUG_ON(!slab);
+ inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects);
if (slab_nid(slab) != node) {
pr_err("SLUB: Unable to allocate memory from node %d\n", node);
pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
@@ -3950,7 +4057,6 @@ static void early_kmem_cache_node_alloc(int node)
n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
slab->freelist = get_freepointer(kmem_cache_node, n);
slab->inuse = 1;
- slab->frozen = 0;
kmem_cache_node->node[node] = n;
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, slab->objects);
@@ -4112,12 +4218,17 @@ static int calculate_sizes(struct kmem_cache *s)
}
#ifdef CONFIG_SLUB_DEBUG
- if (flags & SLAB_STORE_USER)
+ if (flags & SLAB_STORE_USER) {
/*
* Need to store information about allocs and frees after
* the object.
*/
size += 2 * sizeof(struct track);
+
+ /* Save the original kmalloc request size */
+ if (flags & SLAB_KMALLOC)
+ size += sizeof(unsigned int);
+ }
#endif
kasan_cache_create(s, &size, &s->flags);
@@ -4237,23 +4348,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
- unsigned long flags;
- unsigned long *map;
void *p;
slab_err(s, slab, text, s->name);
- slab_lock(slab, &flags);
- map = get_map(s, slab);
+ spin_lock(&object_map_lock);
+ __fill_map(object_map, s, slab);
+
for_each_object(p, s, addr, slab->objects) {
- if (!test_bit(__obj_to_index(s, addr, p), map)) {
+ if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
- put_map(map);
- slab_unlock(slab, &flags);
+ spin_unlock(&object_map_lock);
#endif
}
@@ -4404,78 +4513,6 @@ static int __init setup_slub_min_objects(char *str)
__setup("slub_min_objects=", setup_slub_min_objects);
-void *__kmalloc(size_t size, gfp_t flags)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, flags);
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
-
- trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc);
-
-#ifdef CONFIG_NUMA
-static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
-{
- struct page *page;
- void *ptr = NULL;
- unsigned int order = get_order(size);
-
- flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, order);
- if (page) {
- ptr = page_address(page);
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
- }
-
- return kmalloc_large_node_hook(ptr, size, flags);
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, flags, node);
-
- trace_kmalloc_node(_RET_IP_, ret, NULL,
- size, PAGE_SIZE << get_order(size),
- flags, node);
-
- return ret;
- }
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
-
- trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_HARDENED_USERCOPY
/*
* Rejects incorrectly sized objects and objects that are to be copied
@@ -4526,43 +4563,6 @@ void __check_heap_object(const void *ptr, unsigned long n,
}
#endif /* CONFIG_HARDENED_USERCOPY */
-size_t __ksize(const void *object)
-{
- struct folio *folio;
-
- if (unlikely(object == ZERO_SIZE_PTR))
- return 0;
-
- folio = virt_to_folio(object);
-
- if (unlikely(!folio_test_slab(folio)))
- return folio_size(folio);
-
- return slab_ksize(folio_slab(folio)->slab_cache);
-}
-EXPORT_SYMBOL(__ksize);
-
-void kfree(const void *x)
-{
- struct folio *folio;
- struct slab *slab;
- void *object = (void *)x;
-
- trace_kfree(_RET_IP_, x);
-
- if (unlikely(ZERO_OR_NULL_PTR(x)))
- return;
-
- folio = virt_to_folio(x);
- if (unlikely(!folio_test_slab(folio))) {
- free_large_kmalloc(folio, object);
- return;
- }
- slab = folio_slab(folio);
- slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_);
-}
-EXPORT_SYMBOL(kfree);
-
#define SHRINK_PROMOTE_MAX 32
/*
@@ -4611,6 +4611,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
if (free == slab->objects) {
list_move(&slab->slab_list, &discard);
n->nr_partial--;
+ dec_slabs_node(s, node, slab->objects);
} else if (free <= SHRINK_PROMOTE_MAX)
list_move(&slab->slab_list, promote + free - 1);
}
@@ -4626,7 +4627,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
/* Release empty slabs */
list_for_each_entry_safe(slab, t, &discard, slab_list)
- discard_slab(s, slab);
+ free_slab(s, slab);
if (slabs_node(s, node))
ret = 1;
@@ -4858,6 +4859,8 @@ void __init kmem_cache_init(void)
void __init kmem_cache_init_late(void)
{
+ flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!flushwq);
}
struct kmem_cache *
@@ -4908,60 +4911,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
return 0;
}
-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, gfpflags);
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, NULL, gfpflags, caller, size);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc(caller, ret, s, size, s->size, gfpflags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, gfpflags, node);
-
- trace_kmalloc_node(caller, ret, NULL,
- size, PAGE_SIZE << get_order(size),
- gfpflags, node);
-
- return ret;
- }
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif
-
#ifdef CONFIG_SYSFS
static int count_inuse(struct slab *slab)
{
@@ -4980,12 +4929,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab,
{
void *p;
void *addr = slab_address(slab);
- unsigned long flags;
-
- slab_lock(slab, &flags);
if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
- goto unlock;
+ return;
/* Now we know that a valid freelist exists */
__fill_map(obj_map, s, slab);
@@ -4996,8 +4942,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab,
if (!check_object(s, slab, p, val))
break;
}
-unlock:
- slab_unlock(slab, &flags);
}
static int validate_slab_node(struct kmem_cache *s,
@@ -5068,6 +5012,7 @@ struct location {
depot_stack_handle_t handle;
unsigned long count;
unsigned long addr;
+ unsigned long waste;
long long sum_time;
long min_time;
long max_time;
@@ -5114,13 +5059,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
}
static int add_location(struct loc_track *t, struct kmem_cache *s,
- const struct track *track)
+ const struct track *track,
+ unsigned int orig_size)
{
long start, end, pos;
struct location *l;
- unsigned long caddr, chandle;
+ unsigned long caddr, chandle, cwaste;
unsigned long age = jiffies - track->when;
depot_stack_handle_t handle = 0;
+ unsigned int waste = s->object_size - orig_size;
#ifdef CONFIG_STACKDEPOT
handle = READ_ONCE(track->handle);
@@ -5138,11 +5085,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
if (pos == end)
break;
- caddr = t->loc[pos].addr;
- chandle = t->loc[pos].handle;
- if ((track->addr == caddr) && (handle == chandle)) {
+ l = &t->loc[pos];
+ caddr = l->addr;
+ chandle = l->handle;
+ cwaste = l->waste;
+ if ((track->addr == caddr) && (handle == chandle) &&
+ (waste == cwaste)) {
- l = &t->loc[pos];
l->count++;
if (track->when) {
l->sum_time += age;
@@ -5167,6 +5116,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
end = pos;
else if (track->addr == caddr && handle < chandle)
end = pos;
+ else if (track->addr == caddr && handle == chandle &&
+ waste < cwaste)
+ end = pos;
else
start = pos;
}
@@ -5190,6 +5142,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
l->min_pid = track->pid;
l->max_pid = track->pid;
l->handle = handle;
+ l->waste = waste;
cpumask_clear(to_cpumask(l->cpus));
cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
nodes_clear(l->nodes);
@@ -5202,13 +5155,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
unsigned long *obj_map)
{
void *addr = slab_address(slab);
+ bool is_alloc = (alloc == TRACK_ALLOC);
void *p;
__fill_map(obj_map, s, slab);
for_each_object(p, s, addr, slab->objects)
if (!test_bit(__obj_to_index(s, addr, p), obj_map))
- add_location(t, s, get_track(s, p, alloc));
+ add_location(t, s, get_track(s, p, alloc),
+ is_alloc ? get_orig_size(s, p) :
+ s->object_size);
}
#endif /* CONFIG_DEBUG_FS */
#endif /* CONFIG_SLUB_DEBUG */
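
With the original request size available, add_location() now also keys its buckets on the per-object waste (object_size - orig_size), and slab_debugfs_show() reports it as "waste=<total>/<per object>". A small sketch of that aggregation with invented call-site ids and sizes:

#include <stdio.h>

struct location {
	unsigned long addr;	/* call-site identifier */
	unsigned long waste;	/* object_size - orig_size, per object */
	unsigned long count;
};

#define MAX_LOC 16
static struct location locs[MAX_LOC];
static int nr_locs;

/* Merge one allocation into an existing (addr, waste) bucket or open a new one. */
static void add_location(unsigned long addr, unsigned int object_size,
			 unsigned int orig_size)
{
	unsigned long waste = object_size - orig_size;

	for (int i = 0; i < nr_locs; i++) {
		if (locs[i].addr == addr && locs[i].waste == waste) {
			locs[i].count++;
			return;
		}
	}
	if (nr_locs < MAX_LOC)
		locs[nr_locs++] = (struct location){ addr, waste, 1 };
}

int main(void)
{
	add_location(0x100, 64, 40);	/* site 0x100: 40 bytes from kmalloc-64 */
	add_location(0x100, 64, 40);
	add_location(0x200, 96, 96);	/* no waste at this site */

	for (int i = 0; i < nr_locs; i++)
		if (locs[i].waste)
			printf("site %#lx waste=%lu/%lu\n", locs[i].addr,
			       locs[i].count * locs[i].waste, locs[i].waste);
	return 0;
}
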
@@ -5601,7 +5557,7 @@ static ssize_t validate_store(struct kmem_cache *s,
{
int ret = -EINVAL;
- if (buf[0] == '1') {
+ if (buf[0] == '1' && kmem_cache_debug(s)) {
ret = validate_slab_cache(s);
if (ret >= 0)
ret = length;
@@ -5826,7 +5782,6 @@ static ssize_t slab_attr_show(struct kobject *kobj,
{
struct slab_attribute *attribute;
struct kmem_cache *s;
- int err;
attribute = to_slab_attr(attr);
s = to_slab(kobj);
@@ -5834,9 +5789,7 @@ static ssize_t slab_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- err = attribute->show(s, buf);
-
- return err;
+ return attribute->show(s, buf);
}
static ssize_t slab_attr_store(struct kobject *kobj,
@@ -5845,7 +5798,6 @@ static ssize_t slab_attr_store(struct kobject *kobj,
{
struct slab_attribute *attribute;
struct kmem_cache *s;
- int err;
attribute = to_slab_attr(attr);
s = to_slab(kobj);
@@ -5853,8 +5805,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- err = attribute->store(s, buf, len);
- return err;
+ return attribute->store(s, buf, len);
}
static void kmem_cache_release(struct kobject *k)
@@ -5879,7 +5830,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
return slab_kset;
}
-#define ID_STR_LENGTH 64
+#define ID_STR_LENGTH 32
/* Create a unique string id for a slab cache:
*
@@ -5890,7 +5841,8 @@ static char *create_unique_id(struct kmem_cache *s)
char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
char *p = name;
- BUG_ON(!name);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
*p++ = ':';
/*
@@ -5912,9 +5864,12 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'A';
if (p != name + 1)
*p++ = '-';
- p += sprintf(p, "%07u", s->size);
+ p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
- BUG_ON(p > name + ID_STR_LENGTH - 1);
+ if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
+ kfree(name);
+ return ERR_PTR(-EINVAL);
+ }
return name;
}
@@ -5948,6 +5903,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
* for the symlinks.
*/
name = create_unique_id(s);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
}
s->kobj.kset = kset;
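
create_unique_id() now returns an error instead of hitting BUG_ON() when the allocation fails or the id would overflow the (smaller) 32-byte buffer, and sysfs_slab_add() propagates that error. A self-contained sketch of the same defensive pattern; the flag string and format below are arbitrary, not the SLUB id layout:

#include <stdio.h>
#include <stdlib.h>

#define ID_STR_LENGTH 32

/* Build ":<flags>-<size>" into a bounds-checked, heap-allocated buffer. */
static char *create_unique_id(const char *flags, unsigned int size)
{
	char *name = malloc(ID_STR_LENGTH);
	int len;

	if (!name)
		return NULL;			/* was a BUG_ON() before */

	/* snprintf() reports the would-be length, so truncation is detectable. */
	len = snprintf(name, ID_STR_LENGTH, ":%s-%07u", flags, size);
	if (len < 0 || len >= ID_STR_LENGTH) {
		fprintf(stderr, "id would overflow %d bytes\n", ID_STR_LENGTH);
		free(name);
		return NULL;
	}
	return name;
}

int main(void)
{
	char *id = create_unique_id("dA", 4096);

	if (id) {
		printf("%s\n", id);		/* prints ":dA-0004096" */
		free(id);
	}
	return 0;
}
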
@@ -6078,6 +6035,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
else
seq_puts(seq, "<not-available>");
+ if (l->waste)
+ seq_printf(seq, " waste=%lu/%lu",
+ l->count * l->waste, l->waste);
+
if (l->sum_time != l->min_time) {
seq_printf(seq, " age=%ld/%llu/%ld",
l->min_time, div_u64(l->sum_time, l->count),
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e166051566f4..41afa6d45b23 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -151,7 +151,7 @@ void __delete_from_swap_cache(struct folio *folio,
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_FOLIO(entry != folio, folio);
+ VM_BUG_ON_PAGE(entry != folio, entry);
set_page_private(folio_page(folio, i), 0);
xas_next(&xas);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1fdccd2f1422..82e62007881d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3655,7 +3655,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
avail_lists[nid]) {
if (si->bdev) {
- blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
+ blkcg_schedule_throttle(si->bdev->bd_disk, true);
break;
}
}
diff --git a/mm/util.c b/mm/util.c
index c9439c66d8cf..346e40177bc6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -619,6 +619,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
/* Don't even allow crazy sizes */
if (unlikely(size > INT_MAX)) {
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2b1431352dc..382dbe97329f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2550,8 +2550,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (unlikely(buffer_heads_over_limit)) {
- if (folio_get_private(folio) && folio_trylock(folio)) {
- if (folio_get_private(folio))
+ if (folio_test_private(folio) && folio_trylock(folio)) {
+ if (folio_test_private(folio))
filemap_release_folio(folio, 0);
folio_unlock(folio);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 90af9a8572f5..30686e6b4145 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 * CPU migrations and preemption potentially corrupt a counter so
* disable preemption.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
}
__this_cpu_write(*p, x);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
__this_cpu_write(*p, x);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, -overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, -overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -1247,6 +1235,7 @@ const char * const vmstat_text[] = {
"nr_shadow_call_stack",
#endif
"nr_page_table_pages",
+ "nr_sec_page_table_pages",
#ifdef CONFIG_SWAP
"nr_swapcached",
#endif