diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig.debug | 6 | ||||
-rw-r--r-- | mm/backing-dev.c | 25 | ||||
-rw-r--r-- | mm/compaction.c | 7 | ||||
-rw-r--r-- | mm/filemap.c | 18 | ||||
-rw-r--r-- | mm/internal.h | 4 | ||||
-rw-r--r-- | mm/kasan/generic.c | 22 | ||||
-rw-r--r-- | mm/memblock.c | 5 | ||||
-rw-r--r-- | mm/mmap.c | 10 | ||||
-rw-r--r-- | mm/page_alloc.c | 32 | ||||
-rw-r--r-- | mm/shmem.c | 52 | ||||
-rw-r--r-- | mm/slab.h | 11 | ||||
-rw-r--r-- | mm/slab_common.c | 29 | ||||
-rw-r--r-- | mm/slub.c | 116 | ||||
-rw-r--r-- | mm/swapfile.c | 22 | ||||
-rw-r--r-- | mm/userfaultfd.c | 6 | ||||
-rw-r--r-- | mm/util.c | 17 | ||||
-rw-r--r-- | mm/vmalloc.c | 83 | ||||
-rw-r--r-- | mm/vmscan.c | 5 | ||||
-rw-r--r-- | mm/workingset.c | 1 |
19 files changed, 284 insertions, 187 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 321ab379994f..afc72fde0f03 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -64,11 +64,11 @@ config SLUB_DEBUG_ON help Boot with debugging on by default. SLUB boots by default with the runtime debug capabilities switched off. Enabling this is - equivalent to specifying the "slub_debug" parameter on boot. + equivalent to specifying the "slab_debug" parameter on boot. There is no support for more fine grained debug control like - possible with slub_debug=xxx. SLUB debugging may be switched + possible with slab_debug=xxx. SLUB debugging may be switched off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying - "slub_debug=-". + "slab_debug=-". config PAGE_OWNER bool "Track page owner" diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e039d05304dd..5f2be8c8df11 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -372,31 +372,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -/* - * This function is used when the first inode for this wb is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. Since the write-out would - * starts only 'dirty_writeback_interval' centisecs from now anyway, we just - * set up a timer which wakes the bdi thread up later. - * - * Note, we wouldn't bother setting up the timer, but this function is on the - * fast-path (used by '__mark_inode_dirty()'), so we save few context switches - * by delaying the wake-up. - * - * We have to be careful not to postpone flush work if it is scheduled for - * earlier. Thus we use queue_delayed_work(). - */ -void wb_wakeup_delayed(struct bdi_writeback *wb) -{ - unsigned long timeout; - - timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_irq(&wb->work_lock); - if (test_bit(WB_registered, &wb->state)) - queue_delayed_work(bdi_wq, &wb->dwork, timeout); - spin_unlock_irq(&wb->work_lock); -} - static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), diff --git a/mm/compaction.c b/mm/compaction.c index 1faeffb28720..807b58e6eb68 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2827,16 +2827,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, struct page **capture) { - int may_perform_io = (__force int)(gfp_mask & __GFP_IO); struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* - * Check if the GFP flags allow compaction - GFP_NOIO is really - * tricky context because the migration might require IO - */ - if (!may_perform_io) + if (!gfp_compaction_allowed(gfp_mask)) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); diff --git a/mm/filemap.c b/mm/filemap.c index 31ab455c4537..7437b2bd75c1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,6 +124,15 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ +static void mapping_set_update(struct xa_state *xas, + struct address_space *mapping) +{ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + xas_set_update(xas, workingset_update_node); + xas_set_lru(xas, &shadow_nodes); +} + static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { @@ -2607,15 +2616,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* - * Pairs with a barrier in - * block_write_end()->mark_buffer_dirty() or other page - * dirtying routines like iomap_write_end() to ensure - * changes to page contents are visible before we see - * increased inode size. - */ - smp_rmb(); - - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ diff --git a/mm/internal.h b/mm/internal.h index d1c69119b24f..7e486f2c502c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1371,4 +1371,8 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, } #endif /* CONFIG_SHRINKER_DEBUG */ +/* Only track the nodes of mappings with shadow entries */ +void workingset_update_node(struct xa_node *node); +extern struct list_lru shadow_nodes; + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 1900f8576034..6310a180278b 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -334,14 +334,6 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); -/* Only allow cache merging when no per-object metadata is present. */ -slab_flags_t kasan_never_merge(void) -{ - if (!kasan_requires_meta()) - return 0; - return SLAB_KASAN; -} - /* * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. @@ -370,15 +362,13 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, return; /* - * SLAB_KASAN is used to mark caches that are sanitized by KASAN - * and that thus have per-object metadata. - * Currently this flag is used in two places: - * 1. In slab_ksize() to account for per-object metadata when - * calculating the size of the accessible memory within the object. - * 2. In slab_common.c via kasan_never_merge() to prevent merging of - * caches with per-object metadata. + * SLAB_KASAN is used to mark caches that are sanitized by KASAN and + * that thus have per-object metadata. Currently, this flag is used in + * slab_ksize() to account for per-object metadata when calculating the + * size of the accessible memory within the object. Additionally, we use + * SLAB_NO_MERGE to prevent merging of caches with per-object metadata. */ - *flags |= SLAB_KASAN; + *flags |= SLAB_KASAN | SLAB_NO_MERGE; ok_size = *size; diff --git a/mm/memblock.c b/mm/memblock.c index d9f4b82cbffe..d09136e040d3 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -180,8 +180,9 @@ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) /* * Address comparison utilities */ -static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, - phys_addr_t base2, phys_addr_t size2) +unsigned long __init_memblock +memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, + phys_addr_t size2) { return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } diff --git a/mm/mmap.c b/mm/mmap.c index ccf377ee319f..04da02114c6f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -955,13 +955,21 @@ static struct vm_area_struct } else if (merge_prev) { /* case 2 */ if (curr) { vma_start_write(curr); - err = dup_anon_vma(prev, curr, &anon_dup); if (end == curr->vm_end) { /* case 7 */ + /* + * can_vma_merge_after() assumed we would not be + * removing prev vma, so it skipped the check + * for vm_ops->close, but we are removing curr + */ + if (curr->vm_ops && curr->vm_ops->close) + err = -EINVAL; remove = curr; } else { /* case 5 */ adjust = curr; adj_start = (end - curr->vm_start); } + if (!err) + err = dup_anon_vma(prev, curr, &anon_dup); } } else { /* merge_next */ vma_start_write(next); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cc7f9b322193..14d39f34d336 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4047,6 +4047,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + bool can_compact = gfp_compaction_allowed(gfp_mask); const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@ -4117,7 +4118,7 @@ restart: * Don't try this for allocations that are allowed to ignore * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && + if (can_direct_reclaim && can_compact && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask)) { @@ -4215,9 +4216,10 @@ retry: /* * Do not retry costly high order allocations unless they are - * __GFP_RETRY_MAYFAIL + * __GFP_RETRY_MAYFAIL and we can compact */ - if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) + if (costly_order && (!can_compact || + !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -4230,7 +4232,7 @@ retry: * implementation of the compaction depends on the sufficient amount * of free memory (see __compaction_suitable) */ - if (did_some_progress > 0 && + if (did_some_progress > 0 && can_compact && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, &compaction_retries)) @@ -4691,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, gfp_t gfp = gfp_mask; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; + gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | + __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER); nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; @@ -4705,6 +4707,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, return page; } +void page_frag_cache_drain(struct page_frag_cache *nc) +{ + if (!nc->va) + return; + + __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); + nc->va = NULL; +} +EXPORT_SYMBOL(page_frag_cache_drain); + void __page_frag_cache_drain(struct page *page, unsigned int count) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); @@ -4714,9 +4726,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) } EXPORT_SYMBOL(__page_frag_cache_drain); -void *page_frag_alloc_align(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask) +void *__page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -4785,7 +4797,7 @@ refill: return nc->va + offset; } -EXPORT_SYMBOL(page_frag_alloc_align); +EXPORT_SYMBOL(__page_frag_alloc_align); /* * Frees a page fragment allocated out of either a compound or order 0 page. diff --git a/mm/shmem.c b/mm/shmem.c index 30c9dc862505..0aad0d9a621b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -254,7 +254,7 @@ static void shmem_inode_unacct_blocks(struct inode *inode, long pages) } static const struct super_operations shmem_ops; -const struct address_space_operations shmem_aops; +static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -263,6 +263,12 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->a_ops == &shmem_aops; +} +EXPORT_SYMBOL_GPL(shmem_mapping); + bool vma_is_anon_shmem(struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; @@ -311,7 +317,7 @@ static void shmem_disable_quotas(struct super_block *sb) dquot_quota_off(sb, type); } -static struct dquot **shmem_get_dquots(struct inode *inode) +static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } @@ -1966,6 +1972,9 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, int error; bool alloced; + if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) + return -EINVAL; + if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: @@ -2128,12 +2137,36 @@ unlock: return error; } +/** + * shmem_get_folio - find, and lock a shmem folio. + * @inode: inode to search + * @index: the page index. + * @foliop: pointer to the folio if found + * @sgp: SGP_* flags to control behavior + * + * Looks up the page cache entry at @inode & @index. If a folio is + * present, it is returned locked with an increased refcount. + * + * If the caller modifies data in the folio, it must call folio_mark_dirty() + * before unlocking the folio to ensure that the folio is not reclaimed. + * There is no need to reserve space before calling folio_mark_dirty(). + * + * When no folio is found, the behavior depends on @sgp: + * - for SGP_READ, *@foliop is %NULL and 0 is returned + * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned + * - for all other flags a new folio is allocated, inserted into the + * page cache and returned locked in @foliop. + * + * Context: May sleep. + * Return: 0 if successful, else a negative error code. + */ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } +EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue @@ -3374,7 +3407,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { - if (!simple_empty(dentry)) + if (!simple_offset_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); @@ -3431,7 +3464,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (!simple_empty(new_dentry)) + if (!simple_offset_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { @@ -3500,10 +3533,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); + inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; - inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); @@ -4373,7 +4406,9 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif - uuid_gen(&sb->s_uuid); + uuid_t uuid; + uuid_gen(&uuid); + super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { @@ -4484,7 +4519,7 @@ static int shmem_error_remove_folio(struct address_space *mapping, return 0; } -const struct address_space_operations shmem_aops = { +static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS @@ -4496,7 +4531,6 @@ const struct address_space_operations shmem_aops = { #endif .error_remove_folio = shmem_error_remove_folio, }; -EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, @@ -4851,6 +4885,7 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } +EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs @@ -4928,7 +4963,6 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - BUG_ON(!shmem_mapping(mapping)); error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL, NULL); if (error) diff --git a/mm/slab.h b/mm/slab.h index 54deeb0428c6..d2bc9b191222 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -363,7 +363,6 @@ static inline int objs_per_slab(const struct kmem_cache *cache, enum slab_state { DOWN, /* No slab functionality yet */ PARTIAL, /* SLUB: kmem_cache_node available */ - PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ UP, /* Slab caches usable but not all extras yet */ FULL /* Everything is working */ }; @@ -387,7 +386,7 @@ extern const struct kmalloc_info_struct { /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); -void create_kmalloc_caches(slab_flags_t); +void create_kmalloc_caches(void); extern u8 kmalloc_size_index[24]; @@ -422,8 +421,6 @@ gfp_t kmalloc_fix_flags(gfp_t flags); int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); void __init kmem_cache_init(void); -void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, - slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int size, slab_flags_t flags, unsigned int useroffset, unsigned int usersize); @@ -435,8 +432,7 @@ struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name); +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name); static inline bool is_kmalloc_cache(struct kmem_cache *s) { @@ -469,7 +465,6 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s) SLAB_STORE_USER | \ SLAB_TRACE | \ SLAB_CONSISTENCY_CHECKS | \ - SLAB_MEM_SPREAD | \ SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ @@ -528,7 +523,7 @@ static inline bool __slub_debug_enabled(void) #endif /* - * Returns true if any of the specified slub_debug flags is enabled for the + * Returns true if any of the specified slab_debug flags is enabled for the * cache. Use only for flags parsed by setup_slub_debug() as it also enables * the static key. */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 238293b1dbe1..23af762148ca 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -50,7 +50,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB | SLAB_NO_MERGE | kasan_never_merge()) + SLAB_FAILSLAB | SLAB_NO_MERGE) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) @@ -172,7 +172,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, size = ALIGN(size, sizeof(void *)); align = calculate_alignment(flags, align, size); size = ALIGN(size, align); - flags = kmem_cache_flags(size, flags, name); + flags = kmem_cache_flags(flags, name); if (flags & SLAB_NEVER_MERGE) return NULL; @@ -282,7 +282,7 @@ kmem_cache_create_usercopy(const char *name, #ifdef CONFIG_SLUB_DEBUG /* - * If no slub_debug was enabled globally, the static key is not yet + * If no slab_debug was enabled globally, the static key is not yet * enabled by setup_slub_debug(). Enable it if the cache is being * created with any of the debugging flags passed explicitly. * It's also possible that this is the first cache created with @@ -404,8 +404,12 @@ EXPORT_SYMBOL(kmem_cache_create); */ static void kmem_cache_release(struct kmem_cache *s) { - sysfs_slab_unlink(s); - sysfs_slab_release(s); + if (slab_state >= FULL) { + sysfs_slab_unlink(s); + sysfs_slab_release(s); + } else { + slab_kmem_cache_release(s); + } } #else static void kmem_cache_release(struct kmem_cache *s) @@ -766,7 +770,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup); } /* - * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. + * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time. * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is * kmalloc-2M. */ @@ -853,9 +857,10 @@ static unsigned int __kmalloc_minalign(void) return max(minalign, arch_slab_minalign()); } -void __init -new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) +static void __init +new_kmalloc_cache(int idx, enum kmalloc_cache_type type) { + slab_flags_t flags = 0; unsigned int minalign = __kmalloc_minalign(); unsigned int aligned_size = kmalloc_info[idx].size; int aligned_idx = idx; @@ -902,7 +907,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) * may already have been created because they were needed to * enable allocations for slab creation. */ -void __init create_kmalloc_caches(slab_flags_t flags) +void __init create_kmalloc_caches(void) { int i; enum kmalloc_cache_type type; @@ -913,7 +918,7 @@ void __init create_kmalloc_caches(slab_flags_t flags) for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[type][i]) - new_kmalloc_cache(i, type, flags); + new_kmalloc_cache(i, type); /* * Caches that are not of the two-to-the-power-of size. @@ -922,10 +927,10 @@ void __init create_kmalloc_caches(slab_flags_t flags) */ if (KMALLOC_MIN_SIZE <= 32 && i == 6 && !kmalloc_caches[type][1]) - new_kmalloc_cache(1, type, flags); + new_kmalloc_cache(1, type); if (KMALLOC_MIN_SIZE <= 64 && i == 7 && !kmalloc_caches[type][2]) - new_kmalloc_cache(2, type, flags); + new_kmalloc_cache(2, type); } } #ifdef CONFIG_RANDOM_KMALLOC_CACHES diff --git a/mm/slub.c b/mm/slub.c index 2ef88bbf56a3..1bb2a93cf7b6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -295,7 +295,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* * Debugging flags that require metadata to be stored in the slab. These get - * disabled when slub_debug=O is used and a cache's min order increases with + * disabled when slab_debug=O is used and a cache's min order increases with * metadata. */ #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) @@ -306,13 +306,13 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Internal SLUB flags */ /* Poison object */ -#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) +#define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON) /* Use cmpxchg_double */ #ifdef system_has_freelist_aba -#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) +#define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE) #else -#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U) +#define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED #endif /* @@ -391,7 +391,7 @@ struct kmem_cache_cpu { }; struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated frozen slabs */ + struct slab *partial; /* Partially allocated slabs */ #endif local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS @@ -1498,16 +1498,8 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); - /* - * May be called early in order to allocate a slab for the - * kmem_cache_node structure. Solve the chicken-egg - * dilemma by deferring the increment of the count during - * bootstrap (see early_kmem_cache_node_alloc). - */ - if (likely(n)) { - atomic_long_inc(&n->nr_slabs); - atomic_long_add(objects, &n->total_objects); - } + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); } static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) { @@ -1616,7 +1608,7 @@ static inline int free_consistency_checks(struct kmem_cache *s, } /* - * Parse a block of slub_debug options. Blocks are delimited by ';' + * Parse a block of slab_debug options. Blocks are delimited by ';' * * @str: start of block * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified @@ -1677,7 +1669,7 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) break; default: if (init) - pr_err("slub_debug option '%c' unknown. skipped\n", *str); + pr_err("slab_debug option '%c' unknown. skipped\n", *str); } } check_slabs: @@ -1736,7 +1728,7 @@ static int __init setup_slub_debug(char *str) /* * For backwards compatibility, a single list of flags with list of * slabs means debugging is only changed for those slabs, so the global - * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as * long as there is no option specifying flags without a slab list. */ @@ -1760,21 +1752,20 @@ out: return 1; } -__setup("slub_debug", setup_slub_debug); +__setup("slab_debug", setup_slub_debug); +__setup_param("slub_debug", slub_debug, setup_slub_debug, 0); /* * kmem_cache_flags - apply debugging options to the cache - * @object_size: the size of an object without meta data * @flags: flags to set * @name: name of the cache * * Debug option(s) are applied to @flags. In addition to the debug * option(s), if a slab name (or multiple) is specified i.e. - * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... + * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ... * then only the select slabs will receive the debug option(s). */ -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { char *iter; size_t len; @@ -1850,8 +1841,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { return flags; } @@ -2038,11 +2028,6 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, obj_cgroup_uncharge(objcg, objects * obj_full_size(s)); } #else /* CONFIG_MEMCG_KMEM */ -static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) -{ - return NULL; -} - static inline void memcg_free_slab_cgroups(struct slab *slab) { } @@ -2243,7 +2228,7 @@ static void __init init_freelist_randomization(void) } /* Get the next entry on the pre-computed freelist randomized */ -static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab, +static void *next_freelist_entry(struct kmem_cache *s, unsigned long *pos, void *start, unsigned long page_limit, unsigned long freelist_count) @@ -2282,13 +2267,12 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) start = fixup_red_left(s, slab_address(slab)); /* First entry is used as the base of the freelist */ - cur = next_freelist_entry(s, slab, &pos, start, page_limit, - freelist_count); + cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count); cur = setup_object(s, cur); slab->freelist = cur; for (idx = 1; idx < slab->objects; idx++) { - next = next_freelist_entry(s, slab, &pos, start, page_limit, + next = next_freelist_entry(s, &pos, start, page_limit, freelist_count); next = setup_object(s, next); set_freepointer(s, cur, next); @@ -3263,7 +3247,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n", s->name); for_each_kmem_cache_node(s, node, n) { @@ -3326,7 +3310,6 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) counters = slab->counters; new.counters = counters; - VM_BUG_ON(!new.frozen); new.inuse = slab->objects; new.frozen = freelist != NULL; @@ -3498,18 +3481,20 @@ new_slab: slab = slub_percpu_partial(c); slub_set_percpu_partial(c, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - stat(s, CPU_PARTIAL_ALLOC); - if (unlikely(!node_match(slab, node) || - !pfmemalloc_match(slab, gfpflags))) { - slab->next = NULL; - __put_partials(s, slab); - continue; + if (likely(node_match(slab, node) && + pfmemalloc_match(slab, gfpflags))) { + c->slab = slab; + freelist = get_freelist(s, slab); + VM_BUG_ON(!freelist); + stat(s, CPU_PARTIAL_ALLOC); + goto load_freelist; } - freelist = freeze_slab(s, slab); - goto retry_load_slab; + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + slab->next = NULL; + __put_partials(s, slab); } #endif @@ -3792,11 +3777,11 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, zero_size = orig_size; /* - * When slub_debug is enabled, avoid memory initialization integrated + * When slab_debug is enabled, avoid memory initialization integrated * into KASAN and instead zero out the memory via the memset below with * the proper size. Otherwise, KASAN might overwrite SLUB redzones and * cause false-positive reports. This does not lead to a performance - * penalty on production builds, as slub_debug is not intended to be + * penalty on production builds, as slab_debug is not intended to be * enabled there. */ if (__slub_debug_enabled()) @@ -4187,7 +4172,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * then add it. */ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - remove_full(s, n, slab); add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -4201,9 +4185,6 @@ slab_empty: */ remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); - } else { - /* Slab must be on the full list */ - remove_full(s, n, slab); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -4702,8 +4683,8 @@ static unsigned int slub_min_objects; * activity on the partial lists which requires taking the list_lock. This is * less a concern for large slabs though which are rarely used. * - * slub_max_order specifies the order where we begin to stop considering the - * number of objects in a slab as critical. If we reach slub_max_order then + * slab_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slab_max_order then * we try to keep the page order as low as possible. So we accept more waste * of space in favor of a small page order. * @@ -4770,14 +4751,14 @@ static inline int calculate_order(unsigned int size) * and backing off gradually. * * We start with accepting at most 1/16 waste and try to find the - * smallest order from min_objects-derived/slub_min_order up to - * slub_max_order that will satisfy the constraint. Note that increasing + * smallest order from min_objects-derived/slab_min_order up to + * slab_max_order that will satisfy the constraint. Note that increasing * the order can only result in same or less fractional waste, not more. * * If that fails, we increase the acceptable fraction of waste and try * again. The last iteration with fraction of 1/2 would effectively * accept any waste and give us the order determined by min_objects, as - * long as at least single object fits within slub_max_order. + * long as at least single object fits within slab_max_order. */ for (unsigned int fraction = 16; fraction > 1; fraction /= 2) { order = calc_slab_order(size, min_order, slub_max_order, @@ -4787,7 +4768,7 @@ static inline int calculate_order(unsigned int size) } /* - * Doh this slab cannot be placed using slub_max_order. + * Doh this slab cannot be placed using slab_max_order. */ order = get_order(size); if (order <= MAX_PAGE_ORDER) @@ -4857,7 +4838,6 @@ static void early_kmem_cache_node_alloc(int node) slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); - inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); @@ -5104,7 +5084,7 @@ static int calculate_sizes(struct kmem_cache *s) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { - s->flags = kmem_cache_flags(s->size, flags, s->name); + s->flags = kmem_cache_flags(flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif @@ -5313,7 +5293,9 @@ static int __init setup_slub_min_order(char *str) return 1; } -__setup("slub_min_order=", setup_slub_min_order); +__setup("slab_min_order=", setup_slub_min_order); +__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0); + static int __init setup_slub_max_order(char *str) { @@ -5326,7 +5308,8 @@ static int __init setup_slub_max_order(char *str) return 1; } -__setup("slub_max_order=", setup_slub_max_order); +__setup("slab_max_order=", setup_slub_max_order); +__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0); static int __init setup_slub_min_objects(char *str) { @@ -5335,7 +5318,8 @@ static int __init setup_slub_min_objects(char *str) return 1; } -__setup("slub_min_objects=", setup_slub_min_objects); +__setup("slab_min_objects=", setup_slub_min_objects); +__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); #ifdef CONFIG_HARDENED_USERCOPY /* @@ -5663,7 +5647,7 @@ void __init kmem_cache_init(void) /* Now we can use the kmem_cache to allocate kmalloc slabs */ setup_kmalloc_cache_index_table(); - create_kmalloc_caches(0); + create_kmalloc_caches(); /* Setup random freelists for each cache */ init_freelist_randomization(); @@ -6792,14 +6776,12 @@ out_del_kobj: void sysfs_slab_unlink(struct kmem_cache *s) { - if (slab_state >= FULL) - kobject_del(&s->kobj); + kobject_del(&s->kobj); } void sysfs_slab_release(struct kmem_cache *s) { - if (slab_state >= FULL) - kobject_put(&s->kobj); + kobject_put(&s->kobj); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 1155a6304119..4919423cce76 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2552,10 +2552,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) exit_swap_address_space(p->type); inode = mapping->host; - if (p->bdev_handle) { + if (p->bdev_file) { set_blocksize(p->bdev, old_block_size); - bdev_release(p->bdev_handle); - p->bdev_handle = NULL; + fput(p->bdev_file); + p->bdev_file = NULL; } inode_lock(inode); @@ -2785,14 +2785,14 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) int error; if (S_ISBLK(inode->i_mode)) { - p->bdev_handle = bdev_open_by_dev(inode->i_rdev, + p->bdev_file = bdev_file_open_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL); - if (IS_ERR(p->bdev_handle)) { - error = PTR_ERR(p->bdev_handle); - p->bdev_handle = NULL; + if (IS_ERR(p->bdev_file)) { + error = PTR_ERR(p->bdev_file); + p->bdev_file = NULL; return error; } - p->bdev = p->bdev_handle->bdev; + p->bdev = file_bdev(p->bdev_file); p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) @@ -3234,10 +3234,10 @@ bad_swap: p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; - if (p->bdev_handle) { + if (p->bdev_file) { set_blocksize(p->bdev, p->old_block_size); - bdev_release(p->bdev_handle); - p->bdev_handle = NULL; + fput(p->bdev_file); + p->bdev_file = NULL; } inode = NULL; destroy_swap_extents(p); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a0331ba9ae2a..712160cd41ec 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1017,9 +1017,6 @@ static int move_present_pte(struct mm_struct *mm, goto out; } - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) { @@ -1028,6 +1025,9 @@ static int move_present_pte(struct mm_struct *mm, goto out; } + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); diff --git a/mm/util.c b/mm/util.c index 6710efaf1802..669397235787 100644 --- a/mm/util.c +++ b/mm/util.c @@ -136,6 +136,23 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) EXPORT_SYMBOL(kmemdup); /** + * kmemdup_array - duplicate a given array. + * + * @src: array to duplicate. + * @element_size: size of each element of array. + * @count: number of elements to duplicate from array. + * @gfp: GFP mask to use. + * + * Return: duplicated array of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. + */ +void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +{ + return kmemdup(src, size_mul(element_size, count), gfp); +} +EXPORT_SYMBOL(kmemdup_array); + +/** * kvmemdup - duplicate region of memory * * @src: memory region to duplicate diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 25a8df497255..22aa63f4ef63 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -304,8 +304,8 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end, return err; } -int ioremap_page_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) +int vmap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { int err; @@ -318,6 +318,26 @@ int ioremap_page_range(unsigned long addr, unsigned long end, return err; } +int ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + struct vm_struct *area; + + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_IOREMAP)) { + WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr); + return -EINVAL; + } + if (addr != (unsigned long)area->addr || + (void *)end != area->addr + get_vm_area_size(area)) { + WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n", + addr, end, (long)area->addr, + (long)area->addr + get_vm_area_size(area)); + return -ERANGE; + } + return vmap_page_range(addr, end, phys_addr, prot); +} + static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { @@ -635,6 +655,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end, return err; } +static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, + unsigned long end) +{ + might_sleep(); + if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS)) + return -EINVAL; + if (WARN_ON_ONCE(area->flags & VM_NO_GUARD)) + return -EINVAL; + if (WARN_ON_ONCE(!(area->flags & VM_SPARSE))) + return -EINVAL; + if ((end - start) >> PAGE_SHIFT > totalram_pages()) + return -E2BIG; + if (start < (unsigned long)area->addr || + (void *)end > area->addr + get_vm_area_size(area)) + return -ERANGE; + return 0; +} + +/** + * vm_area_map_pages - map pages inside given sparse vm_area + * @area: vm_area + * @start: start address inside vm_area + * @end: end address inside vm_area + * @pages: pages to map (always PAGE_SIZE pages) + */ +int vm_area_map_pages(struct vm_struct *area, unsigned long start, + unsigned long end, struct page **pages) +{ + int err; + + err = check_sparse_vm_area(area, start, end); + if (err) + return err; + + return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT); +} + +/** + * vm_area_unmap_pages - unmap pages inside given sparse vm_area + * @area: vm_area + * @start: start address inside vm_area + * @end: end address inside vm_area + */ +void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, + unsigned long end) +{ + if (check_sparse_vm_area(area, start, end)) + return; + + vunmap_range(start, end); +} + int is_vmalloc_or_module_addr(const void *x) { /* @@ -4220,9 +4292,9 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); - else if (!(vm && (vm->flags & VM_IOREMAP))) + else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE)))) copied = aligned_vread_iter(iter, addr, n); - else /* IOREMAP area is treated as memory hole */ + else /* IOREMAP | SPARSE area is treated as memory hole */ copied = zero_iter(iter, n); addr += copied; @@ -4810,6 +4882,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p) if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); + if (v->flags & VM_SPARSE) + seq_puts(m, " sparse"); + if (v->flags & VM_ALLOC) seq_puts(m, " vmalloc"); diff --git a/mm/vmscan.c b/mm/vmscan.c index 741f16929e72..3ef654addd44 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5759,7 +5759,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -6006,6 +6006,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; + if (!gfp_compaction_allowed(sc->gfp_mask)) + return false; + /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) diff --git a/mm/workingset.c b/mm/workingset.c index 226012974328..f2a0ecaf708d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -16,6 +16,7 @@ #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> +#include "internal.h" /* * Double CLOCK lists |