Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig           |   6
-rw-r--r-- | mm/compaction.c      |   6
-rw-r--r-- | mm/damon/sysfs.c     |   2
-rw-r--r-- | mm/filemap.c         |   7
-rw-r--r-- | mm/kfence/core.c     |   2
-rw-r--r-- | mm/kfence/report.c   |   1
-rw-r--r-- | mm/khugepaged.c      |   4
-rw-r--r-- | mm/memcontrol.c      |  24
-rw-r--r-- | mm/page-writeback.c  |   1
-rw-r--r-- | mm/page_alloc.c      |   6
-rw-r--r-- | mm/pagewalk.c        |  21
-rw-r--r-- | mm/ptdump.c          |   4
-rw-r--r-- | mm/readahead.c       |  22
-rw-r--r-- | mm/rmap.c            |  29
-rw-r--r-- | mm/slab.c            | 305
-rw-r--r-- | mm/slab.h            |  10
-rw-r--r-- | mm/slab_common.c     | 287
-rw-r--r-- | mm/slob.c            |  45
-rw-r--r-- | mm/slub.c            | 883
-rw-r--r-- | mm/swapfile.c        |   2
-rw-r--r-- | mm/util.c            |   4
-rw-r--r-- | mm/vmstat.c          |  37
22 files changed, 863 insertions, 845 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ceec438c0741..57e1d8c5b505 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -573,6 +573,12 @@ config COMPACTION it and then we would be really interested to hear about that at linux-mm@kvack.org. +config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION + default 0 if PREEMPT_RT + default 1 + # # support for free page reporting config PAGE_REPORTING diff --git a/mm/compaction.c b/mm/compaction.c index e2a9615f5fde..2dd02c4683c4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1725,11 +1725,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. */ -#ifdef CONFIG_PREEMPT_RT -int sysctl_compact_unevictable_allowed __read_mostly = 0; -#else -int sysctl_compact_unevictable_allowed __read_mostly = 1; -#endif +int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT; static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 455215a5c059..9f1219a67e3f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2172,12 +2172,12 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (!t) return -ENOMEM; + damon_add_target(ctx, t); if (damon_target_has_pid(ctx)) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) goto destroy_targets_out; } - damon_add_target(ctx, t); err = damon_sysfs_set_regions(t, sys_target->regions); if (err) goto destroy_targets_out; diff --git a/mm/filemap.c b/mm/filemap.c index ec17bd1a3bb7..08341616ae7a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2390,6 +2390,8 @@ retry: static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { + bool workingset = folio_test_workingset(folio); + unsigned long pflags; int error; /* @@ -2398,8 +2400,13 @@ static int filemap_read_folio(struct file *file, filler_t filler, * fails. */ folio_clear_error(folio); + /* Start the actual read. The read will unlock the page. */ + if (unlikely(workingset)) + psi_memstall_enter(&pflags); error = filler(file, folio); + if (unlikely(workingset)) + psi_memstall_leave(&pflags); if (error) return error; diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 26de62a51665..141788858b70 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -853,7 +853,7 @@ static void kfence_init_enable(void) void __init kfence_init(void) { - stack_hash_seed = (u32)random_get_entropy(); + stack_hash_seed = get_random_u32(); /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ if (!kfence_sample_interval) diff --git a/mm/kfence/report.c b/mm/kfence/report.c index f5a6d8ba3e21..7e496856c2eb 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -86,6 +86,7 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries /* Also the *_bulk() variants by only checking prefixes. 
*/ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") || str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) goto found; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c7699fabf302..4734315f7940 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -727,8 +727,8 @@ static void khugepaged_alloc_sleep(void) DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); - freezable_schedule_timeout_interruptible( - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 61e05fc281fb..2d8549ae1b30 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -590,25 +590,18 @@ static u64 flush_next_time; */ static void memcg_stats_lock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_disable(); -#else - VM_BUG_ON(!irqs_disabled()); -#endif + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); } static void __memcg_stats_lock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_disable(); -#endif + preempt_disable_nested(); } static void memcg_stats_unlock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_enable(); -#endif + preempt_enable_nested(); } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) @@ -783,7 +776,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, * interrupt context while other caller need to have disabled interrupt. */ __memcg_stats_lock(); - if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (IS_ENABLED(CONFIG_DEBUG_VM)) { switch (idx) { case NR_ANON_MAPPED: case NR_FILE_MAPPED: @@ -793,7 +786,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, WARN_ON_ONCE(!in_task()); break; default: - WARN_ON_ONCE(!irqs_disabled()); + VM_WARN_ON_IRQS_ENABLED(); } } @@ -1479,6 +1472,7 @@ static const struct memory_stat memory_stats[] = { { "kernel", MEMCG_KMEM }, { "kernel_stack", NR_KERNEL_STACK_KB }, { "pagetables", NR_PAGETABLE }, + { "sec_pagetables", NR_SECONDARY_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, { "vmalloc", MEMCG_VMALLOC }, @@ -5176,8 +5170,8 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) struct mem_cgroup *memcg; cgrp = cgroup_get_from_id(ino); - if (!cgrp) - return ERR_PTR(-ENOENT); + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); if (css) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 032a7bf8d259..7e9d8d857ecc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1933,6 +1933,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, wb_put(wb); return ret; } +EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags); /** * balance_dirty_pages_ratelimited - balance dirty memory state. 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12b6184cbbed..ac2c9f12a7b2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6102,7 +6102,8 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu\n" + " sec_pagetables:%lu bounce:%lu\n" " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), @@ -6119,6 +6120,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), + global_node_page_state(NR_SECONDARY_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), @@ -6154,6 +6156,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i " shadow_call_stack:%lukB" #endif " pagetables:%lukB" + " sec_pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -6179,6 +6182,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif K(node_page_state(pgdat, NR_PAGETABLE)), + K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 131b2b335b2c..2ff3a5bebceb 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -110,7 +110,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, do { again: next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) { + if (pmd_none(*pmd)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) @@ -171,7 +171,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, do { again: next = pud_addr_end(addr, end); - if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) { + if (pud_none(*pud)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) @@ -366,19 +366,19 @@ static int __walk_page_range(unsigned long start, unsigned long end, struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; - if (vma && ops->pre_vma) { + if (ops->pre_vma) { err = ops->pre_vma(start, end, walk); if (err) return err; } - if (vma && is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma)) { if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); - if (vma && ops->post_vma) + if (ops->post_vma) ops->post_vma(walk); return err; @@ -450,9 +450,13 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (!vma) { /* after the last vma */ walk.vma = NULL; next = end; + if (ops->pte_hole) + err = ops->pte_hole(start, next, -1, &walk); } else if (start < vma->vm_start) { /* outside vma */ walk.vma = NULL; next = min(end, vma->vm_start); + if (ops->pte_hole) + err = ops->pte_hole(start, next, -1, &walk); } else { /* inside vma */ walk.vma = vma; next = min(end, vma->vm_end); @@ -470,9 +474,8 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } if (err < 0) break; - } - if (walk.vma || walk.ops->pte_hole) err = __walk_page_range(start, next, &walk); + } if (err) break; } while (start = next, start < end); @@ 
-509,9 +512,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, if (start >= end || !walk.mm) return -EINVAL; - mmap_assert_locked(walk.mm); + mmap_assert_write_locked(walk.mm); - return __walk_page_range(start, end, &walk); + return walk_pgd_range(start, end, &walk); } int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, diff --git a/mm/ptdump.c b/mm/ptdump.c index eea3d28d173c..8adab455a68b 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -152,13 +152,13 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) { const struct ptdump_range *range = st->range; - mmap_read_lock(mm); + mmap_write_lock(mm); while (range->start != range->end) { walk_page_range_novma(mm, range->start, range->end, &ptdump_ops, pgd, st); range++; } - mmap_read_unlock(mm); + mmap_write_unlock(mm); /* Flush out the last page */ st->note_page(st, 0, -1, 0); diff --git a/mm/readahead.c b/mm/readahead.c index fdcd28cbd92d..b10f0cf81d80 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -122,6 +122,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> #include <linux/pagemap.h> +#include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> @@ -152,6 +153,8 @@ static void read_pages(struct readahead_control *rac) if (!readahead_count(rac)) return; + if (unlikely(rac->_workingset)) + psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { @@ -179,6 +182,9 @@ static void read_pages(struct readahead_control *rac) } blk_finish_plug(&plug); + if (unlikely(rac->_workingset)) + psi_memstall_leave(&rac->_pflags); + rac->_workingset = false; BUG_ON(readahead_count(rac)); } @@ -252,6 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, } if (i == nr_to_read - lookahead_size) folio_set_readahead(folio); + ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages++; } @@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); - if (err) + if (err) { folio_put(folio); - else - ractl->_nr_pages += 1UL << order; - return err; + return err; + } + + ractl->_nr_pages += 1UL << order; + ractl->_workingset |= folio_test_workingset(folio); + return 0; } void page_cache_ra_order(struct readahead_control *ractl, @@ -826,6 +836,10 @@ void readahead_expand(struct readahead_control *ractl, put_page(page); return; } + if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } ractl->_nr_pages++; if (ra) { ra->size++; diff --git a/mm/rmap.c b/mm/rmap.c index b6743c2b8b5f..2ec925e5fa6a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -93,7 +93,8 @@ static inline struct anon_vma *anon_vma_alloc(void) anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); - anon_vma->degree = 1; /* Reference for first vma */ + anon_vma->num_children = 0; + anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself. 
If called @@ -201,6 +202,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; + anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } @@ -210,8 +212,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; + anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } @@ -296,19 +297,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) anon_vma_chain_link(dst, avc, anon_vma); /* - * Reuse existing anon_vma if its degree lower than two, - * that means it has no vma and only one anon_vma child. + * Reuse existing anon_vma if it has no vma and only one + * anon_vma child. * - * Do not choose parent anon_vma, otherwise first child - * will always reuse it. Root anon_vma is never reused: + * Root anon_vma is never reused: * it has self-parent reference and at least one child. */ if (!dst->anon_vma && src->anon_vma && - anon_vma != src->anon_vma && anon_vma->degree < 2) + anon_vma->num_children < 2 && + anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma; } if (dst->anon_vma) - dst->anon_vma->degree++; + dst->anon_vma->num_active_vmas++; unlock_anon_vma_root(root); return 0; @@ -358,6 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; + anon_vma->num_active_vmas++; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; @@ -378,7 +380,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); - anon_vma->parent->degree++; + anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; @@ -410,7 +412,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * to free them outside the lock. 
*/ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { - anon_vma->parent->degree--; + anon_vma->parent->num_children--; continue; } @@ -418,7 +420,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) anon_vma_chain_free(avc); } if (vma->anon_vma) { - vma->anon_vma->degree--; + vma->anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared @@ -436,7 +438,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - VM_WARN_ON(anon_vma->degree); + VM_WARN_ON(anon_vma->num_children); + VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); diff --git a/mm/slab.c b/mm/slab.c index 10e96137b44f..a5486ff8362a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3181,84 +3181,46 @@ must_grow: } static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, - unsigned long caller) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - unsigned long save_flags; - void *ptr; + void *objp = NULL; int slab_node = numa_mem_id(); - struct obj_cgroup *objcg = NULL; - bool init = false; - flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags); - if (unlikely(!cachep)) - return NULL; - - ptr = kfence_alloc(cachep, orig_size, flags); - if (unlikely(ptr)) - goto out_hooks; - - local_irq_save(save_flags); - - if (nodeid == NUMA_NO_NODE) - nodeid = slab_node; - - if (unlikely(!get_node(cachep, nodeid))) { - /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); - goto out; - } - - if (nodeid == slab_node) { + if (nodeid == NUMA_NO_NODE) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { + objp = alternate_node_alloc(cachep, flags); + if (objp) + goto out; + } /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); - if (ptr) - goto out; - } - /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); -out: - local_irq_restore(save_flags); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - init = slab_want_init_on_alloc(flags, cachep); - -out_hooks: - slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init); - return ptr; -} - -static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) -{ - void *objp; - - if (current->mempolicy || cpuset_do_slab_mem_spread()) { - objp = alternate_node_alloc(cache, flags); - if (objp) - goto out; + objp = ____cache_alloc(cachep, flags); + nodeid = slab_node; + } else if (nodeid == slab_node) { + objp = ____cache_alloc(cachep, flags); + } else if (!get_node(cachep, nodeid)) { + /* Node not bootstrapped yet */ + objp = fallback_alloc(cachep, flags); + goto out; } - objp = ____cache_alloc(cache, flags); /* * We may just have run out of memory on the local node. 
* ____cache_alloc_node() knows how to locate memory on other nodes */ if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_mem_id()); - + objp = ____cache_alloc_node(cachep, flags, nodeid); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) { return ____cache_alloc(cachep, flags); } @@ -3266,8 +3228,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - size_t orig_size, unsigned long caller) +slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3284,7 +3246,7 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, goto out; local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); + objp = __do_cache_alloc(cachep, flags, nodeid); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); @@ -3295,6 +3257,14 @@ out: return objp; } +static __always_inline void * +slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + size_t orig_size, unsigned long caller) +{ + return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, + caller); +} + /* * Caller needs to acquire correct kmem_cache_node's list_lock * @list: List of detached free slabs should be freed by caller @@ -3470,8 +3440,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, flags); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); return ret; } @@ -3521,7 +3490,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_disable(); for (i = 0; i < size; i++) { - void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); + void *objp = kfence_alloc(s, s->object_size, flags) ?: + __do_cache_alloc(s, flags, NUMA_NO_NODE); if (unlikely(!objp)) goto error; @@ -3548,23 +3518,6 @@ error: } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -#ifdef CONFIG_TRACING -void * -kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) -{ - void *ret; - - ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, cachep, - size, cachep->size, flags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif - -#ifdef CONFIG_NUMA /** * kmem_cache_alloc_node - Allocate an object on the specified node * @cachep: The cache to allocate from. 
@@ -3580,66 +3533,22 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); + void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, - flags, nodeid); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid, - size_t size) +void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, size_t orig_size, + unsigned long caller) { - void *ret; - - ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc_node(_RET_IP_, ret, cachep, - size, cachep->size, - flags, nodeid); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif - -static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) -{ - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); - ret = kasan_kmalloc(cachep, ret, size, flags); - - return ret; + return slab_alloc_node(cachep, NULL, flags, nodeid, + orig_size, caller); } -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, flags, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_PRINTK void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { @@ -3662,45 +3571,25 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) } #endif -/** - * __do_kmalloc - allocate memory - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate (see kmalloc). 
- * @caller: function caller for debug tracking of the caller - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, - unsigned long caller) +static __always_inline +void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = slab_alloc(cachep, NULL, flags, size, caller); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(caller, ret, cachep, - size, cachep->size, flags); - - return ret; -} + unsigned long flags; -void *__kmalloc(size_t size, gfp_t flags) -{ - return __do_kmalloc(size, flags, _RET_IP_); + local_irq_save(flags); + debug_check_no_locks_freed(objp, cachep->object_size); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, cachep->object_size); + __cache_free(cachep, objp, caller); + local_irq_restore(flags); } -EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) +void __kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { - return __do_kmalloc(size, flags, caller); + __do_kmem_cache_free(cachep, objp, caller); } -EXPORT_SYMBOL(__kmalloc_track_caller); /** * kmem_cache_free - Deallocate an object @@ -3712,34 +3601,38 @@ EXPORT_SYMBOL(__kmalloc_track_caller); */ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { - unsigned long flags; cachep = cache_from_obj(cachep, objp); if (!cachep) return; - trace_kmem_cache_free(_RET_IP_, objp, cachep->name); - local_irq_save(flags); - debug_check_no_locks_freed(objp, cachep->object_size); - if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, _RET_IP_); - local_irq_restore(flags); + trace_kmem_cache_free(_RET_IP_, objp, cachep); + __do_kmem_cache_free(cachep, objp, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) { - struct kmem_cache *s; - size_t i; local_irq_disable(); - for (i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { void *objp = p[i]; + struct kmem_cache *s; - if (!orig_s) /* called via kfree_bulk */ - s = virt_to_cache(objp); - else + if (!orig_s) { + struct folio *folio = virt_to_folio(objp); + + /* called via kfree_bulk */ + if (!folio_test_slab(folio)) { + local_irq_enable(); + free_large_kmalloc(folio, objp); + local_irq_disable(); + continue; + } + s = folio_slab(folio)->slab_cache; + } else { s = cache_from_obj(orig_s, objp); + } + if (!s) continue; @@ -3755,39 +3648,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -/** - * kfree - free previously allocated memory - * @objp: pointer returned by kmalloc. - * - * If @objp is NULL, no operation is performed. - * - * Don't free memory not originally allocated by kmalloc() - * or you will run into trouble. 
- */ -void kfree(const void *objp) -{ - struct kmem_cache *c; - unsigned long flags; - - trace_kfree(_RET_IP_, objp); - - if (unlikely(ZERO_OR_NULL_PTR(objp))) - return; - local_irq_save(flags); - kfree_debugcheck(objp); - c = virt_to_cache(objp); - if (!c) { - local_irq_restore(flags); - return; - } - debug_check_no_locks_freed(objp, c->object_size); - - debug_check_no_obj_freed(objp, c->object_size); - __cache_free(c, (void *)objp, _RET_IP_); - local_irq_restore(flags); -} -EXPORT_SYMBOL(kfree); - /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ @@ -4190,28 +4050,3 @@ void __check_heap_object(const void *ptr, unsigned long n, usercopy_abort("SLAB object", cachep->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ - -/** - * __ksize -- Uninstrumented ksize. - * @objp: pointer to the object - * - * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same - * safety checks as ksize() with KASAN instrumentation enabled. - * - * Return: size of the actual memory used by @objp in bytes - */ -size_t __ksize(const void *objp) -{ - struct kmem_cache *c; - size_t size; - - BUG_ON(!objp); - if (unlikely(objp == ZERO_SIZE_PTR)) - return 0; - - c = virt_to_cache(objp); - size = c ? c->object_size : 0; - - return size; -} -EXPORT_SYMBOL(__ksize); diff --git a/mm/slab.h b/mm/slab.h index 9d0afd2985df..0202a8c2f0d2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -273,6 +273,11 @@ void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); + +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller); +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); #endif gfp_t kmalloc_fix_flags(gfp_t flags); @@ -658,8 +663,13 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) print_tracking(cachep, x); return cachep; } + +void free_large_kmalloc(struct folio *folio, void *object); + #endif /* CONFIG_SLOB */ +size_t __ksize(const void *objp); + static inline size_t slab_ksize(const struct kmem_cache *s) { #ifndef CONFIG_SLUB diff --git a/mm/slab_common.c b/mm/slab_common.c index 17996649cfe3..33b1886b06eb 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -392,6 +392,28 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, } EXPORT_SYMBOL(kmem_cache_create); +#ifdef SLAB_SUPPORTS_SYSFS +/* + * For a given kmem_cache, kmem_cache_destroy() should only be called + * once or there will be a use-after-free problem. The actual deletion + * and release of the kobject does not need slab_mutex or cpu_hotplug_lock + * protection. So they are now done without holding those locks. + * + * Note that there will be a slight delay in the deletion of sysfs files + * if kmem_cache_release() is called indrectly from a work function. 
+ */ +static void kmem_cache_release(struct kmem_cache *s) +{ + sysfs_slab_unlink(s); + sysfs_slab_release(s); +} +#else +static void kmem_cache_release(struct kmem_cache *s) +{ + slab_kmem_cache_release(s); +} +#endif + static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) { LIST_HEAD(to_destroy); @@ -418,11 +440,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) list_for_each_entry_safe(s, s2, &to_destroy, list) { debugfs_slab_release(s); kfence_shutdown_cache(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_release(s); -#else - slab_kmem_cache_release(s); -#endif + kmem_cache_release(s); } } @@ -437,20 +455,11 @@ static int shutdown_cache(struct kmem_cache *s) list_del(&s->list); if (s->flags & SLAB_TYPESAFE_BY_RCU) { -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_unlink(s); -#endif list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { kfence_shutdown_cache(s); debugfs_slab_release(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_unlink(s); - sysfs_slab_release(s); -#else - slab_kmem_cache_release(s); -#endif } return 0; @@ -465,14 +474,19 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { + int refcnt; + bool rcu_set; + if (unlikely(!s) || !kasan_check_byte(s)) return; cpus_read_lock(); mutex_lock(&slab_mutex); - s->refcount--; - if (s->refcount) + rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; + + refcnt = --s->refcount; + if (refcnt) goto out_unlock; WARN(shutdown_cache(s), @@ -481,6 +495,8 @@ void kmem_cache_destroy(struct kmem_cache *s) out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); + if (!refcnt && !rcu_set) + kmem_cache_release(s); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -495,13 +511,9 @@ EXPORT_SYMBOL(kmem_cache_destroy); */ int kmem_cache_shrink(struct kmem_cache *cachep) { - int ret; - - kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep); - return ret; + return __kmem_cache_shrink(cachep); } EXPORT_SYMBOL(kmem_cache_shrink); @@ -649,7 +661,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, if (!s) panic("Out of memory when creating slab %s\n", name); - create_boot_cache(s, name, size, flags, useroffset, usersize); + create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset, + usersize); kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; @@ -721,6 +734,26 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) return kmalloc_caches[kmalloc_type(flags)][index]; } +size_t kmalloc_size_roundup(size_t size) +{ + struct kmem_cache *c; + + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. */ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + /* Above the smaller buckets, size is a multiple of page size. */ + if (size > KMALLOC_MAX_CACHE_SIZE) + return PAGE_SIZE << get_order(size); + + /* The flags don't matter since size_index is common to all. */ + c = kmalloc_slab(size, GFP_KERNEL); + return c ? c->object_size : 0; +} +EXPORT_SYMBOL(kmalloc_size_roundup); + #ifdef CONFIG_ZONE_DMA #define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz, #else @@ -744,8 +777,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. - * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is - * kmalloc-32M. 
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is + * kmalloc-2M. */ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(0, 0), @@ -769,11 +802,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(262144, 256k), INIT_KMALLOC_INFO(524288, 512k), INIT_KMALLOC_INFO(1048576, 1M), - INIT_KMALLOC_INFO(2097152, 2M), - INIT_KMALLOC_INFO(4194304, 4M), - INIT_KMALLOC_INFO(8388608, 8M), - INIT_KMALLOC_INFO(16777216, 16M), - INIT_KMALLOC_INFO(33554432, 32M) + INIT_KMALLOC_INFO(2097152, 2M) }; /* @@ -886,6 +915,156 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* Kmalloc array is now usable */ slab_state = UP; } + +void free_large_kmalloc(struct folio *folio, void *object) +{ + unsigned int order = folio_order(folio); + + if (WARN_ON_ONCE(order == 0)) + pr_warn_once("object pointer: 0x%p\n", object); + + kmemleak_free(object); + kasan_kfree_large(object); + kmsan_kfree_large(object); + + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(folio, 0), order); +} + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); +static __always_inline +void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node(size, flags, node); + trace_kmalloc(_RET_IP_, ret, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; + } + + s = kmalloc_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = __kmem_cache_alloc_node(s, flags, node, size, caller); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node); + return ret; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc. + * + * If @object is NULL, no operation is performed. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, (void *)object); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + __kmem_cache_free(s, (void *)object, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + +/** + * __ksize -- Report full size of underlying allocation + * @objp: pointer to the object + * + * This should only be used internally to query the true size of allocations. + * It is not meant to be a way to discover the usable size of an allocation + * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond + * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, + * and/or FORTIFY_SOURCE. 
+ * + * Return: size of the actual memory used by @objp in bytes + */ +size_t __ksize(const void *object) +{ + struct folio *folio; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + folio = virt_to_folio(object); + + if (unlikely(!folio_test_slab(folio))) { + if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) + return 0; + if (WARN_ON(object != folio_address(folio))) + return 0; + return folio_size(folio); + } + + return slab_ksize(folio_slab(folio)->slab_cache); +} + +#ifdef CONFIG_TRACING +void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, + size, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_trace); + +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_node_trace); +#endif /* !CONFIG_TRACING */ #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) @@ -905,37 +1084,51 @@ gfp_t kmalloc_fix_flags(gfp_t flags) * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. */ -void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) { - void *ret = NULL; struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); if (unlikely(flags & GFP_SLAB_BUG_MASK)) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - page = alloc_pages(flags, order); - if (likely(page)) { - ret = page_address(page); + page = alloc_pages_node(node, flags, order); + if (page) { + ptr = page_address(page); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); } - ret = kasan_kmalloc_large(ret, size, flags); - /* As ret might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ret, size, 1, flags); + + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. 
*/ + kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); + + return ptr; +} + +void *kmalloc_large(size_t size, gfp_t flags) +{ + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_order); +EXPORT_SYMBOL(kmalloc_large); -#ifdef CONFIG_TRACING -void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - void *ret = kmalloc_order(size, flags, order); - trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags); + void *ret = __kmalloc_large_node(size, flags, node); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_order_trace); -#endif +EXPORT_SYMBOL(kmalloc_large_node); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ @@ -1134,8 +1327,8 @@ module_init(slab_proc_init); #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; size_t ks; @@ -1267,8 +1460,6 @@ EXPORT_SYMBOL(ksize); /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/slob.c b/mm/slob.c index 2bd4f476c340..fe567fcfa3a3 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -507,8 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) *m = size; ret = (void *)m + minalign; - trace_kmalloc_node(caller, ret, NULL, - size, size + minalign, gfp, node); + trace_kmalloc(caller, ret, size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -516,8 +515,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << order, gfp, node); + trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); @@ -530,20 +528,12 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) -{ - return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); -} -EXPORT_SYMBOL(__kmalloc_track_caller); - -#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, int node, unsigned long caller) { return __do_kmalloc_node(size, gfp, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif void kfree(const void *block) { @@ -574,6 +564,20 @@ void kfree(const void *block) } EXPORT_SYMBOL(kfree); +size_t kmalloc_size_roundup(size_t size) +{ + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. 
*/ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + + return ALIGN(size, ARCH_KMALLOC_MINALIGN); +} + +EXPORT_SYMBOL(kmalloc_size_roundup); + /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t __ksize(const void *block) { @@ -594,7 +598,6 @@ size_t __ksize(const void *block) m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -EXPORT_SYMBOL(__ksize); int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { @@ -602,6 +605,9 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } + + /* Actual size allocated */ + c->size = SLOB_UNITS(c->size) * SLOB_UNIT; c->flags = flags; return 0; } @@ -616,14 +622,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - PAGE_SIZE << get_order(c->size), - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } if (b && c->ctor) { @@ -647,7 +649,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_ return slob_alloc_node(cachep, flags, NUMA_NO_NODE); } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_NUMA + void *__kmalloc_node(size_t size, gfp_t gfp, int node) { return __do_kmalloc_node(size, gfp, node, _RET_IP_); @@ -659,7 +661,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) return slob_alloc_node(cachep, gfp, node); } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif static void __kmem_cache_free(void *b, int size) { @@ -680,7 +681,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - trace_kmem_cache_free(_RET_IP_, b, c->name); + trace_kmem_cache_free(_RET_IP_, b, c); if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); diff --git a/mm/slub.c b/mm/slub.c index ce8310e131b3..96dd392d7f99 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -51,7 +51,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches or for debugging) + * 4. slab_lock(slab) (Only on some arches) * 5. object_map_lock (Only for debugging) * * slab_mutex @@ -65,8 +65,9 @@ * The slab_lock is a wrapper around the page lock, thus it is a bit * spinlock. * - * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects: + * The slab_lock is only used on arches that do not have the ability + * to do a cmpxchg_double. It only protects: + * * A. slab->freelist -> List of free objects in a slab * B. slab->inuse -> Number of objects in use * C. slab->objects -> Number of objects in slab @@ -95,15 +96,20 @@ * allocating a long series of objects that fill up slabs does not require * the list lock. * + * For debug caches, all allocations are forced to go through a list_lock + * protected region to serialize against concurrent validation. 
+ * * cpu_slab->lock local lock * * This locks protect slowpath manipulation of all kmem_cache_cpu fields * except the stat counters. This is a percpu structure manipulated only by * the local cpu, so the lock protects against being preempted or interrupted * by an irq. Fast path operations rely on lockless operations instead. - * On PREEMPT_RT, the local lock does not actually disable irqs (and thus - * prevent the lockless operations), so fastpath operations also need to take - * the lock and are no longer lockless. + * + * On PREEMPT_RT, the local lock neither disables interrupts nor preemption + * which means the lockless fastpath cannot be used as it might interfere with + * an in-progress slow path operations. In this case the local lock is always + * taken but it still utilizes the freelist for the common operations. * * lockless fastpaths * @@ -164,8 +170,9 @@ * function call even on !PREEMPT_RT, use inline preempt_disable() there. */ #ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define USE_LOCKLESS_FAST_PATH() (true) #else #define slub_get_cpu_ptr(var) \ ({ \ @@ -177,6 +184,7 @@ do { \ (void)(var); \ migrate_enable(); \ } while (0) +#define USE_LOCKLESS_FAST_PATH() (false) #endif #ifdef CONFIG_SLUB_DEBUG @@ -187,11 +195,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif /* CONFIG_SLUB_DEBUG */ +/* Structure holding parameters for get_partial() call chain */ +struct partial_context { + struct slab **slab; + gfp_t flags; + unsigned int orig_size; +}; + static inline bool kmem_cache_debug(struct kmem_cache *s) { return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } +static inline bool slub_debug_orig_size(struct kmem_cache *s) +{ + return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && + (s->flags & SLAB_KMALLOC)); +} + void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -311,6 +332,11 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) */ static nodemask_t slab_nodes; +/* + * Workqueue used for flush_cpu_slab(). 
+ */ +static struct workqueue_struct *flushwq; + /******************************************************************** * Core slab cache functions *******************************************************************/ @@ -454,7 +480,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) /* * Per slab locking using the pagelock */ -static __always_inline void __slab_lock(struct slab *slab) +static __always_inline void slab_lock(struct slab *slab) { struct page *page = slab_page(slab); @@ -462,7 +488,7 @@ static __always_inline void __slab_lock(struct slab *slab) bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void __slab_unlock(struct slab *slab) +static __always_inline void slab_unlock(struct slab *slab) { struct page *page = slab_page(slab); @@ -470,31 +496,19 @@ static __always_inline void __slab_unlock(struct slab *slab) __bit_spin_unlock(PG_locked, &page->flags); } -static __always_inline void slab_lock(struct slab *slab, unsigned long *flags) -{ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(*flags); - __slab_lock(slab); -} - -static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags) -{ - __slab_unlock(slab); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(*flags); -} - /* * Interrupts must be disabled (for the fallback code to work right), typically - * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different - * so we disable interrupts as part of slab_[un]lock(). + * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is + * part of bit_spin_lock(), is sufficient because the policy is not to allow any + * allocation/ free operation in hardirq context. Therefore nothing can + * interrupt the operation. */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -506,18 +520,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab } else #endif { - /* init to 0 to prevent spurious warnings */ - unsigned long flags = 0; - - slab_lock(slab, &flags); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - slab_unlock(slab, &flags); + slab_unlock(slab); return true; } - slab_unlock(slab, &flags); + slab_unlock(slab); } cpu_relax(); @@ -548,16 +559,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, unsigned long flags; local_irq_save(flags); - __slab_lock(slab); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); return true; } - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); } @@ -573,7 +584,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_RAW_SPINLOCK(object_map_lock); +static DEFINE_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, struct slab *slab) @@ -607,30 +618,6 @@ static bool slab_add_kunit_errors(void) 
static inline bool slab_add_kunit_errors(void) { return false; } #endif -/* - * Determine a map of objects in use in a slab. - * - * Node listlock must be held to guarantee that the slab does - * not vanish from under us. - */ -static unsigned long *get_map(struct kmem_cache *s, struct slab *slab) - __acquires(&object_map_lock) -{ - VM_BUG_ON(!irqs_disabled()); - - raw_spin_lock(&object_map_lock); - - __fill_map(object_map, s, slab); - - return object_map; -} - -static void put_map(unsigned long *map) __releases(&object_map_lock) -{ - VM_BUG_ON(map != object_map); - raw_spin_unlock(&object_map_lock); -} - static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) @@ -828,6 +815,39 @@ static void print_slab_info(const struct slab *slab) folio_flags(folio, 0)); } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + static void slab_bug(struct kmem_cache *s, char *fmt, ...) { struct va_format vaf; @@ -887,6 +907,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); + if (slub_debug_orig_size(s)) + off += sizeof(unsigned int); + off += kasan_metadata_size(s); if (off != size_from_object(s)) @@ -1020,7 +1043,8 @@ skip_bug_print: * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at minimum + * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * D. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. * @@ -1038,10 +1062,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ - if (s->flags & SLAB_STORE_USER) + if (s->flags & SLAB_STORE_USER) { /* We also have user information there */ off += 2 * sizeof(struct track); + if (s->flags & SLAB_KMALLOC) + off += sizeof(unsigned int); + } + off += kasan_metadata_size(s); if (size_from_object(s) == off) @@ -1336,18 +1364,16 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, } static noinline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, - void *object, unsigned long addr) + struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!alloc_consistency_checks(s, slab, object)) goto bad; } - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); + /* Success. 
Perform special debug activities for allocs */ trace(s, slab, object, 1); + set_orig_size(s, object, orig_size); init_object(s, object, SLUB_RED_ACTIVE); return 1; @@ -1397,63 +1423,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 1; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - void *object = head; - int cnt = 0; - unsigned long flags, flags2; - int ret = 0; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, slab)) - goto out; - } - -next_object: - cnt++; - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!free_consistency_checks(s, slab, object, addr)) - goto out; - } - - if (s->flags & SLAB_STORE_USER) - set_track_update(s, object, TRACK_FREE, addr, handle); - trace(s, slab, object, 0); - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ - init_object(s, object, SLUB_RED_INACTIVE); - - /* Reached end of constructed freelist yet? */ - if (object != tail) { - object = get_freepointer(s, object); - goto next_object; - } - ret = 1; - -out: - if (cnt != bulk_cnt) - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", - bulk_cnt, cnt); - - slab_unlock(slab, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) - slab_fix(s, "Object at 0x%p not freed", object); - return ret; -} - /* * Parse a block of slub_debug options. Blocks are delimited by ';' * @@ -1673,16 +1642,18 @@ static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, unsigned long addr) { return 0; } + struct slab *slab, void *object, int orig_size) { return 0; } -static inline int free_debug_processing( +static inline void free_debug_processing( struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, - unsigned long addr) { return 0; } + unsigned long addr) {} static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1716,22 +1687,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ -static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - ptr = kasan_kmalloc_large(ptr, size, flags); - /* As ptr might get tagged, call kmemleak hook after KASAN. 
*/ - kmemleak_alloc(ptr, size, 1, flags); - kmsan_kmalloc_large(ptr, size, flags); - return ptr; -} - -static __always_inline void kfree_hook(void *x) -{ - kmemleak_free(x); - kasan_kfree_large(x); - kmsan_kfree_large(x); -} - static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { @@ -1991,11 +1946,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) - goto out; + return NULL; stat(s, ORDER_FALLBACK); } slab->objects = oo_objects(oo); + slab->inuse = 0; + slab->frozen = 0; account_slab(slab, oo_order(oo), s, flags); @@ -2022,15 +1979,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - slab->inuse = slab->objects; - slab->frozen = 1; - -out: - if (!slab) - return NULL; - - inc_slabs_node(s, slab_nid(slab), slab->objects); - return slab; } @@ -2118,6 +2066,75 @@ static inline void remove_partial(struct kmem_cache_node *n, } /* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move + * it to full list if it was the last free object. + */ +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab, int orig_size) +{ + void *object; + + lockdep_assert_held(&n->list_lock); + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; + + if (!alloc_debug_processing(s, slab, object, orig_size)) { + remove_partial(n, slab); + return NULL; + } + + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + return object; +} + +/* + * Called only for kmem_cache_debug() caches to allocate from a freshly + * allocated slab. Allocate a single object instead of whole freelist + * and put the slab to the partial (or full) list. + */ +static void *alloc_single_from_new_slab(struct kmem_cache *s, + struct slab *slab, int orig_size) +{ + int nid = slab_nid(slab); + struct kmem_cache_node *n = get_node(s, nid); + unsigned long flags; + void *object; + + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse = 1; + + if (!alloc_debug_processing(s, slab, object, orig_size)) + /* + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. + */ + return NULL; + + spin_lock_irqsave(&n->list_lock, flags); + + if (slab->inuse == slab->objects) + add_full(s, n, slab); + else + add_partial(n, slab, DEACTIVATE_TO_HEAD); + + inc_slabs_node(s, nid, slab->objects); + spin_unlock_irqrestore(&n->list_lock, flags); + + return object; +} + +/* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. * @@ -2174,7 +2191,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); * Try to allocate a partial slab from a specific node. 
*/ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct slab **ret_slab, gfp_t gfpflags) + struct partial_context *pc) { struct slab *slab, *slab2; void *object = NULL; @@ -2194,15 +2211,23 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { void *t; - if (!pfmemalloc_match(slab, gfpflags)) + if (!pfmemalloc_match(slab, pc->flags)) + continue; + + if (kmem_cache_debug(s)) { + object = alloc_single_from_partial(s, n, slab, + pc->orig_size); + if (object) + break; continue; + } t = acquire_slab(s, n, slab, object == NULL); if (!t) break; if (!object) { - *ret_slab = slab; + *pc->slab = slab; stat(s, ALLOC_FROM_PARTIAL); object = t; } else { @@ -2226,14 +2251,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, /* * Get a slab from somewhere. Search in increasing NUMA distances. */ -static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct slab **ret_slab) +static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type highest_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(pc->flags); void *object; unsigned int cpuset_mems_cookie; @@ -2261,15 +2285,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(mempolicy_slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), pc->flags); for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed(zone, flags) && + if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, ret_slab, flags); + object = get_partial_node(s, n, pc); if (object) { /* * Don't check read_mems_allowed_retry() @@ -2290,8 +2314,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, /* * Get a partial slab, lock it and return it. 
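get_partial_node(), get_any_partial() and get_partial() now take a single struct partial_context instead of separate gfp flags and an output slab pointer. The structure itself is defined earlier in slub.c, outside the hunks shown; a minimal sketch inferred from how pc->flags, *pc->slab and pc->orig_size are used in these functions:

struct partial_context {
	struct slab **slab;	/* output: the slab the object was taken from */
	gfp_t flags;		/* gfp flags of the original allocation */
	unsigned int orig_size;	/* original kmalloc request size, for debug caches */
};

Bundling the three values keeps the call signatures stable while letting the debug path (alloc_single_from_partial()) reach the original request size.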
*/ -static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct slab **ret_slab) +static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) { void *object; int searchnode = node; @@ -2299,11 +2322,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags); + object = get_partial_node(s, get_node(s, searchnode), pc); if (object || node != NUMA_NO_NODE) return object; - return get_any_partial(s, flags, ret_slab); + return get_any_partial(s, pc); } #ifdef CONFIG_PREEMPTION @@ -2745,7 +2768,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) INIT_WORK(&sfw->work, flush_cpu_slab); sfw->skip = false; sfw->s = s; - schedule_work_on(cpu, &sfw->work); + queue_work_on(cpu, flushwq, &sfw->work); } for_each_online_cpu(cpu) { @@ -2803,6 +2826,113 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) { return atomic_long_read(&n->total_objects); } + +/* Supports checking bulk free of a constructed freelist */ +static noinline void free_debug_processing( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + void *object = head; + int cnt = 0; + unsigned long flags; + bool checks_ok = false; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + + if (slab->inuse < bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, bulk_cnt); + goto out; + } + +next_object: + + if (++cnt > bulk_cnt) + goto out_cnt; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + checks_ok = true; + +out_cnt: + if (cnt != bulk_cnt) + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", + bulk_cnt, cnt); + +out: + if (checks_ok) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. 
+ */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (!checks_ok) + slab_fix(s, "Object at 0x%p not freed", object); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) @@ -2920,11 +3050,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *freelist; struct slab *slab; unsigned long flags; + struct partial_context pc; stat(s, ALLOC_SLOWPATH); @@ -3038,7 +3169,10 @@ new_slab: new_objects: - freelist = get_partial(s, gfpflags, node, &slab); + pc.flags = gfpflags; + pc.slab = &slab; + pc.orig_size = orig_size; + freelist = get_partial(s, node, &pc); if (freelist) goto check_new_slab; @@ -3051,36 +3185,53 @@ new_objects: return NULL; } + stat(s, ALLOC_SLAB); + + if (kmem_cache_debug(s)) { + freelist = alloc_single_from_new_slab(s, slab, orig_size); + + if (unlikely(!freelist)) + goto new_objects; + + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + /* * No other reference to the slab yet so we can * muck around with it freely without cmpxchg */ freelist = slab->freelist; slab->freelist = NULL; + slab->inuse = slab->objects; + slab->frozen = 1; - stat(s, ALLOC_SLAB); + inc_slabs_node(s, slab_nid(slab), slab->objects); check_new_slab: if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, slab, freelist, addr)) { - /* Slab failed checks. Next slab needed */ - goto new_slab; - } else { - /* - * For debug case, we don't load freelist so that all - * allocations go through alloc_debug_processing() - */ - goto return_single; - } + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the tracking info + * and return the object + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; } - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags))) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. */ - goto return_single; + deactivate_slab(s, slab, get_freepointer(s, freelist)); + return freelist; + } retry_load_slab: @@ -3104,11 +3255,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; - -return_single: - - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; } /* @@ -3117,7 +3263,7 @@ return_single: * pointer. 
*/ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *p; @@ -3130,7 +3276,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = slub_get_cpu_ptr(s->cpu_slab); #endif - p = ___slab_alloc(s, gfpflags, node, addr, c); + p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -3212,16 +3358,10 @@ redo: object = c->freelist; slab = c->slab; - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if a - * slowpath has taken the local_lock_irqsave(), it is not protected - * against a fast path operation in an irq handler. So we need to take - * the slow path which uses local_lock. It is still relatively fast if - * there is a suitable cpu freelist. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) || + + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c); + object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); } else { void *next_object = get_freepointer_safe(s, object); @@ -3272,8 +3412,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, { void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, - s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } @@ -3291,46 +3430,24 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller) { - void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size); - trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags); - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; + return slab_alloc_node(s, NULL, gfpflags, node, + caller, orig_size); } -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif -#ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - trace_kmem_cache_alloc_node(_RET_IP_, ret, s, - s->object_size, s->size, gfpflags, node); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) -{ - void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, s, - size, s->size, gfpflags, node); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif -#endif /* CONFIG_NUMA */ - /* * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. 
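The open-coded IS_ENABLED(CONFIG_PREEMPT_RT) checks in the alloc and free fast paths are replaced by USE_LOCKLESS_FAST_PATH(). The macro is defined near the top of slub.c, outside the hunks shown; inferred from the branches it replaces, it reduces to:

/* Assumed definition, reconstructed from the removed PREEMPT_RT branches */
#ifndef CONFIG_PREEMPT_RT
#define USE_LOCKLESS_FAST_PATH()	(true)
#else
#define USE_LOCKLESS_FAST_PATH()	(false)
#endif

On PREEMPT_RT the slow path takes the local_lock, because the lockless cmpxchg fast path is not protected against a slow path on the same CPU that already holds local_lock_irqsave(); on all other configurations the this_cpu_cmpxchg_double() fast path is used as before.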
@@ -3356,9 +3473,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) + if (kmem_cache_debug(s)) { + free_debug_processing(s, slab, head, tail, cnt, addr); return; + } do { if (unlikely(n)) { @@ -3478,6 +3596,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; + void **freelist; redo: /* @@ -3492,9 +3611,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); - if (likely(slab == c->slab)) { -#ifndef CONFIG_PREEMPT_RT - void **freelist = READ_ONCE(c->freelist); + if (unlikely(slab != c->slab)) { + __slab_free(s, slab, head, tail_obj, cnt, addr); + return; + } + + if (USE_LOCKLESS_FAST_PATH()) { + freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3506,16 +3629,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } -#else /* CONFIG_PREEMPT_RT */ - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if - * a slowpath has taken the local_lock_irqsave(), it is not - * protected against a fast path operation in an irq handler. So - * we need to take the local_lock. We shouldn't simply defer to - * __slab_free() as that wouldn't use the cpu freelist at all. - */ - void **freelist; - + } else { + /* Update the free list under the local lock */ local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { @@ -3530,11 +3645,8 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); -#endif - stat(s, FREE_FASTPATH); - } else - __slab_free(s, slab, head, tail_obj, cnt, addr); - + } + stat(s, FREE_FASTPATH); } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, @@ -3557,12 +3669,17 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) +{ + slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); +} + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); if (!s) return; - trace_kmem_cache_free(_RET_IP_, x, s->name); + trace_kmem_cache_free(_RET_IP_, x, s); slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -3575,19 +3692,6 @@ struct detached_freelist { struct kmem_cache *s; }; -static inline void free_large_kmalloc(struct folio *folio, void *object) -{ - unsigned int order = folio_order(folio); - - if (WARN_ON_ONCE(order == 0)) - pr_warn_once("object pointer: 0x%p\n", object); - - kfree_hook(object); - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(folio_page(folio, 0), order); -} - /* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same @@ -3724,7 +3828,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * of re-populating per CPU c->freelist */ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c); + _RET_IP_, c, s->object_size); if (unlikely(!p[i])) goto error; @@ -3951,6 +4055,7 @@ static void early_kmem_cache_node_alloc(int node) slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); + inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a 
useless per node structure in order to be able to continue\n"); @@ -3965,7 +4070,6 @@ static void early_kmem_cache_node_alloc(int node) n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; - slab->frozen = 0; kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); @@ -4127,12 +4231,17 @@ static int calculate_sizes(struct kmem_cache *s) } #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) + if (flags & SLAB_STORE_USER) { /* * Need to store information about allocs and frees after * the object. */ size += 2 * sizeof(struct track); + + /* Save the original kmalloc request size */ + if (flags & SLAB_KMALLOC) + size += sizeof(unsigned int); + } #endif kasan_cache_create(s, &size, &s->flags); @@ -4252,23 +4361,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); - unsigned long flags; - unsigned long *map; void *p; slab_err(s, slab, text, s->name); - slab_lock(slab, &flags); - map = get_map(s, slab); + spin_lock(&object_map_lock); + __fill_map(object_map, s, slab); + for_each_object(p, s, addr, slab->objects) { - if (!test_bit(__obj_to_index(s, addr, p), map)) { + if (!test_bit(__obj_to_index(s, addr, p), object_map)) { pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } - put_map(map); - slab_unlock(slab, &flags); + spin_unlock(&object_map_lock); #endif } @@ -4419,78 +4526,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -void *__kmalloc(size_t size, gfp_t flags) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, flags); - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, flags, _RET_IP_, size); - - trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc); - -#ifdef CONFIG_NUMA -static void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page; - void *ptr = NULL; - unsigned int order = get_order(size); - - flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - } - - return kmalloc_large_node_hook(ptr, size, flags); -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, flags, node); - - trace_kmalloc_node(_RET_IP_, ret, NULL, - size, PAGE_SIZE << get_order(size), - flags, node); - - return ret; - } - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied @@ -4541,43 +4576,6 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif /* CONFIG_HARDENED_USERCOPY */ -size_t __ksize(const void *object) -{ - struct folio *folio; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - - folio = 
virt_to_folio(object); - - if (unlikely(!folio_test_slab(folio))) - return folio_size(folio); - - return slab_ksize(folio_slab(folio)->slab_cache); -} -EXPORT_SYMBOL(__ksize); - -void kfree(const void *x) -{ - struct folio *folio; - struct slab *slab; - void *object = (void *)x; - - trace_kfree(_RET_IP_, x); - - if (unlikely(ZERO_OR_NULL_PTR(x))) - return; - - folio = virt_to_folio(x); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); - return; - } - slab = folio_slab(folio); - slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_); -} -EXPORT_SYMBOL(kfree); - #define SHRINK_PROMOTE_MAX 32 /* @@ -4626,6 +4624,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; + dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } @@ -4641,7 +4640,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) - discard_slab(s, slab); + free_slab(s, slab); if (slabs_node(s, node)) ret = 1; @@ -4873,6 +4872,8 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { + flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); + WARN_ON(!flushwq); } struct kmem_cache * @@ -4923,60 +4924,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, gfpflags); - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, gfpflags, caller, size); - - /* Honor the call site pointer we received. */ - trace_kmalloc(caller, ret, s, size, s->size, gfpflags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_track_caller); - -#ifdef CONFIG_NUMA -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, gfpflags, node); - - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << get_order(size), - gfpflags, node); - - return ret; - } - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size); - - /* Honor the call site pointer we received. 
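kmem_cache_init_late() above now creates a dedicated workqueue for slab flushing, and flush_all_cpus_locked() (earlier in this patch) queues its per-CPU work items on it via queue_work_on() instead of schedule_work_on(). A sketch of the piece that lives outside the hunks shown:

/* File-scope declaration, assumed to sit near the top of mm/slub.c */
static struct workqueue_struct *flushwq;

WQ_MEM_RECLAIM gives the queue a rescuer thread, so flushing cpu slabs, which may be needed to make progress during reclaim, cannot be starved by a congested system workqueue.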
*/ - trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif - #ifdef CONFIG_SYSFS static int count_inuse(struct slab *slab) { @@ -4995,12 +4942,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, { void *p; void *addr = slab_address(slab); - unsigned long flags; - - slab_lock(slab, &flags); if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) - goto unlock; + return; /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); @@ -5011,8 +4955,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, if (!check_object(s, slab, p, val)) break; } -unlock: - slab_unlock(slab, &flags); } static int validate_slab_node(struct kmem_cache *s, @@ -5083,6 +5025,7 @@ struct location { depot_stack_handle_t handle; unsigned long count; unsigned long addr; + unsigned long waste; long long sum_time; long min_time; long max_time; @@ -5129,13 +5072,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) } static int add_location(struct loc_track *t, struct kmem_cache *s, - const struct track *track) + const struct track *track, + unsigned int orig_size) { long start, end, pos; struct location *l; - unsigned long caddr, chandle; + unsigned long caddr, chandle, cwaste; unsigned long age = jiffies - track->when; depot_stack_handle_t handle = 0; + unsigned int waste = s->object_size - orig_size; #ifdef CONFIG_STACKDEPOT handle = READ_ONCE(track->handle); @@ -5153,11 +5098,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (pos == end) break; - caddr = t->loc[pos].addr; - chandle = t->loc[pos].handle; - if ((track->addr == caddr) && (handle == chandle)) { + l = &t->loc[pos]; + caddr = l->addr; + chandle = l->handle; + cwaste = l->waste; + if ((track->addr == caddr) && (handle == chandle) && + (waste == cwaste)) { - l = &t->loc[pos]; l->count++; if (track->when) { l->sum_time += age; @@ -5182,6 +5129,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, end = pos; else if (track->addr == caddr && handle < chandle) end = pos; + else if (track->addr == caddr && handle == chandle && + waste < cwaste) + end = pos; else start = pos; } @@ -5205,6 +5155,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->min_pid = track->pid; l->max_pid = track->pid; l->handle = handle; + l->waste = waste; cpumask_clear(to_cpumask(l->cpus)); cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); @@ -5217,13 +5168,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, unsigned long *obj_map) { void *addr = slab_address(slab); + bool is_alloc = (alloc == TRACK_ALLOC); void *p; __fill_map(obj_map, s, slab); for_each_object(p, s, addr, slab->objects) if (!test_bit(__obj_to_index(s, addr, p), obj_map)) - add_location(t, s, get_track(s, p, alloc)); + add_location(t, s, get_track(s, p, alloc), + is_alloc ? 
get_orig_size(s, p) : + s->object_size); } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ @@ -5616,7 +5570,7 @@ static ssize_t validate_store(struct kmem_cache *s, { int ret = -EINVAL; - if (buf[0] == '1') { + if (buf[0] == '1' && kmem_cache_debug(s)) { ret = validate_slab_cache(s); if (ret >= 0) ret = length; @@ -5867,7 +5821,6 @@ static ssize_t slab_attr_show(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5875,9 +5828,7 @@ static ssize_t slab_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - err = attribute->show(s, buf); - - return err; + return attribute->show(s, buf); } static ssize_t slab_attr_store(struct kobject *kobj, @@ -5886,7 +5837,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5894,8 +5844,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - err = attribute->store(s, buf, len); - return err; + return attribute->store(s, buf, len); } static void kmem_cache_release(struct kobject *k) @@ -5920,7 +5869,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) return slab_kset; } -#define ID_STR_LENGTH 64 +#define ID_STR_LENGTH 32 /* Create a unique string id for a slab cache: * @@ -5931,7 +5880,8 @@ static char *create_unique_id(struct kmem_cache *s) char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); char *p = name; - BUG_ON(!name); + if (!name) + return ERR_PTR(-ENOMEM); *p++ = ':'; /* @@ -5953,9 +5903,12 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07u", s->size); + p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size); - BUG_ON(p > name + ID_STR_LENGTH - 1); + if (WARN_ON(p > name + ID_STR_LENGTH - 1)) { + kfree(name); + return ERR_PTR(-EINVAL); + } kmsan_unpoison_memory(name, p - name); return name; } @@ -5990,6 +5943,8 @@ static int sysfs_slab_add(struct kmem_cache *s) * for the symlinks. 
*/ name = create_unique_id(s); + if (IS_ERR(name)) + return PTR_ERR(name); } s->kobj.kset = kset; @@ -6121,6 +6076,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v) else seq_puts(seq, "<not-available>"); + if (l->waste) + seq_printf(seq, " waste=%lu/%lu", + l->count * l->waste, l->waste); + if (l->sum_time != l->min_time) { seq_printf(seq, " age=%ld/%llu/%ld", l->min_time, div_u64(l->sum_time, l->count), diff --git a/mm/swapfile.c b/mm/swapfile.c index 4efcfe34e45b..5fc1237a9f21 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3650,7 +3650,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); + blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } diff --git a/mm/util.c b/mm/util.c index 5cd3f7910f2c..12984e76767e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -587,6 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + /* Don't even allow crazy sizes */ if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); diff --git a/mm/vmstat.c b/mm/vmstat.c index bd8040f25c27..b2371d745e00 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -354,8 +354,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, * CPU migrations and preemption potentially corrupts a counter so * disable preemption. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -367,8 +366,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, } __this_cpu_write(*p, x); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -392,8 +390,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -405,8 +402,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } __this_cpu_write(*p, x); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -440,8 +436,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 v, t; /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -452,8 +447,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) __this_cpu_write(*p, -overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -465,8 +459,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -477,8 +470,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) __this_cpu_write(*p, -overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - 
preempt_enable(); + preempt_enable_nested(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -500,8 +492,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 v, t; /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -512,8 +503,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) __this_cpu_write(*p, overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -525,8 +515,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -537,8 +526,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) __this_cpu_write(*p, overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) @@ -1246,6 +1234,7 @@ const char * const vmstat_text[] = { "nr_shadow_call_stack", #endif "nr_page_table_pages", + "nr_sec_page_table_pages", #ifdef CONFIG_SWAP "nr_swapcached", #endif |
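The vmstat hunks replace every explicit "if (IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable()/preempt_enable()" pair with preempt_disable_nested()/preempt_enable_nested(). The helpers come from <linux/preempt.h>; a hedged sketch that captures the semantics implied by the code they replace:

static __always_inline void preempt_disable_nested(void)
{
	/*
	 * On PREEMPT_RT the caller's irq-safe context does not imply
	 * disabled preemption, so disable it explicitly.
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		preempt_disable();
	/*
	 * On !RT the per-cpu counters are already protected by the caller
	 * having interrupts or preemption disabled; nothing to do here
	 * (the in-tree helper may additionally assert that via lockdep).
	 */
}

static __always_inline void preempt_enable_nested(void)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		preempt_enable();
}

This keeps the per-cpu counter updates race-free on PREEMPT_RT while adding no code on other configurations.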