summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig.debug | 1
-rw-r--r-- mm/damon/core.c | 2
-rw-r--r-- mm/filemap.c | 57
-rw-r--r-- mm/gup.c | 58
-rw-r--r-- mm/gup_test.c | 1
-rw-r--r-- mm/internal.h | 6
-rw-r--r-- mm/kfence/kfence.h | 2
-rw-r--r-- mm/khugepaged.c | 1
-rw-r--r-- mm/memblock.c | 9
-rw-r--r-- mm/memfd.c | 9
-rw-r--r-- mm/mm_init.c | 7
-rw-r--r-- mm/mmap.c | 37
-rw-r--r-- mm/mprotect.c | 2
-rw-r--r-- mm/page_alloc.c | 173
-rw-r--r-- mm/page_io.c | 8
-rw-r--r-- mm/page_table_check.c | 6
-rw-r--r-- mm/shmem.c | 134
-rw-r--r-- mm/shrinker_debug.c | 54
-rw-r--r-- mm/slab.h | 53
-rw-r--r-- mm/slub.c | 139
-rw-r--r-- mm/swapfile.c | 6
-rw-r--r-- mm/vmalloc.c | 17
-rw-r--r-- mm/vmscan.c | 130
-rw-r--r-- mm/vmstat.c | 3
-rw-r--r-- mm/zsmalloc.c | 36
-rw-r--r-- mm/zswap.c | 27
26 files changed, 731 insertions, 247 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index a925415b4d10..018a5bd2f576 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -98,6 +98,7 @@ config PAGE_OWNER
config PAGE_TABLE_CHECK
bool "Check for invalid mappings in user page tables"
depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ depends on EXCLUSIVE_SYSTEM_RAM
select PAGE_EXTENSION
help
Check that anonymous page is not being mapped twice with read write
diff --git a/mm/damon/core.c b/mm/damon/core.c
index d9ef62047bf5..91cff7f2997e 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -551,6 +551,8 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
return -EINVAL;
if (attrs->min_nr_regions > attrs->max_nr_regions)
return -EINVAL;
+ if (attrs->sample_interval > attrs->aggr_interval)
+ return -EINVAL;
damon_update_monitoring_results(ctx, attrs);
ctx->attrs = *attrs;
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..00f01d8ead47 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1760,7 +1760,9 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'return - index >= max_scan' will be true).
- * In the rare case of index wrap-around, 0 will be returned.
+ * In the rare case of index wrap-around, 0 will be returned. 0 will also
+ * be returned if index == 0 and there is a gap at the index; the scan
+ * cannot wrap around when the passed index == 0.
*/
pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
@@ -1770,12 +1772,13 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
while (max_scan--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
- break;
- if (xas.xa_index == 0)
- break;
+ return xas.xa_index;
+ if (xas.xa_index == 0 && index != 0)
+ return xas.xa_index;
}
- return xas.xa_index;
+ /* No gaps in range and no wrap-around, return index beyond range */
+ return xas.xa_index + 1;
}
EXPORT_SYMBOL(page_cache_next_miss);
@@ -1796,7 +1799,9 @@ EXPORT_SYMBOL(page_cache_next_miss);
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'index - return >= max_scan' will be true).
- * In the rare case of wrap-around, ULONG_MAX will be returned.
+ * In the rare case of wrap-around, ULONG_MAX will be returned. ULONG_MAX
+ * will also be returned if index == ULONG_MAX and there is a gap at the
+ * index; the scan cannot wrap around when the passed index == ULONG_MAX.
*/
pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
@@ -1806,12 +1811,13 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
while (max_scan--) {
void *entry = xas_prev(&xas);
if (!entry || xa_is_value(entry))
- break;
- if (xas.xa_index == ULONG_MAX)
- break;
+ return xas.xa_index;
+ if (xas.xa_index == ULONG_MAX && index != ULONG_MAX)
+ return xas.xa_index;
}
- return xas.xa_index;
+ /* No gaps in range and no wrap-around, return index beyond range */
+ return xas.xa_index - 1;
}
EXPORT_SYMBOL(page_cache_prev_miss);
@@ -2687,8 +2693,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;
- error = filemap_get_pages(iocb, iter->count, &fbatch,
- iov_iter_is_pipe(iter));
+ error = filemap_get_pages(iocb, iter->count, &fbatch, false);
if (error < 0)
break;
@@ -2872,9 +2877,24 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
return spliced;
}
-/*
- * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
- * a pipe.
+/**
+ * filemap_splice_read - Splice data from a file's pagecache into a pipe
+ * @in: The file to read from
+ * @ppos: Pointer to the file position to read from
+ * @pipe: The pipe to splice into
+ * @len: The amount to splice
+ * @flags: The SPLICE_F_* flags
+ *
+ * This function gets folios from a file's pagecache and splices them into the
+ * pipe. Readahead will be called as necessary to fill more folios. This may
+ * be used for blockdevs also.
+ *
+ * Return: On success, the number of bytes read will be returned and *@ppos
+ * will be updated if appropriate; 0 will be returned if there is no more data
+ * to be read; -EAGAIN will be returned if the pipe had no space, and some
+ * other negative error code will be returned on error. A short read may occur
+ * if the pipe has insufficient space, we reach the end of the data or we hit a
+ * hole.
*/
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
@@ -2887,6 +2907,9 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
bool writably_mapped;
int i, error = 0;
+ if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
+ return 0;
+
init_sync_kiocb(&iocb, in);
iocb.ki_pos = *ppos;
@@ -2900,7 +2923,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
do {
cond_resched();
- if (*ppos >= i_size_read(file_inode(in)))
+ if (*ppos >= i_size_read(in->f_mapping->host))
break;
iocb.ki_pos = *ppos;
@@ -2916,7 +2939,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
- isize = i_size_read(file_inode(in));
+ isize = i_size_read(in->f_mapping->host);
if (unlikely(*ppos >= isize))
break;
end_offset = min_t(loff_t, isize, *ppos + len);
diff --git a/mm/gup.c b/mm/gup.c
index bbe416236593..0814576b7366 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -51,7 +51,8 @@ static inline void sanity_check_pinned_pages(struct page **pages,
struct page *page = *pages;
struct folio *folio = page_folio(page);
- if (!folio_test_anon(folio))
+ if (is_zero_page(page) ||
+ !folio_test_anon(folio))
continue;
if (!folio_test_large(folio) || folio_test_hugetlb(folio))
VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
@@ -132,6 +133,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
struct folio *folio;
/*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return page_folio(page);
+
+ /*
* Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
* right zone, so fail and let the caller fall back to the slow
* path.
@@ -180,6 +188,8 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
if (flags & FOLL_PIN) {
+ if (is_zero_folio(folio))
+ return;
node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
if (folio_test_large(folio))
atomic_sub(refs, &folio->_pincount);
@@ -225,6 +235,13 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
folio_ref_inc(folio);
else if (flags & FOLL_PIN) {
/*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return 0;
+
+ /*
* Similar to try_grab_folio(): be sure to *also*
* increment the normal page refcount field at least once,
* so that the page really is pinned.
@@ -258,6 +275,33 @@ void unpin_user_page(struct page *page)
}
EXPORT_SYMBOL(unpin_user_page);
+/**
+ * folio_add_pin - Try to get an additional pin on a pinned folio
+ * @folio: The folio to be pinned
+ *
+ * Get an additional pin on a folio we already have a pin on. Makes no change
+ * if the folio is a zero_page.
+ */
+void folio_add_pin(struct folio *folio)
+{
+ if (is_zero_folio(folio))
+ return;
+
+ /*
+ * Similar to try_grab_folio(): be sure to *also* increment the normal
+ * page refcount field at least once, so that the page really is
+ * pinned.
+ */
+ if (folio_test_large(folio)) {
+ WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
+ folio_ref_inc(folio);
+ atomic_inc(&folio->_pincount);
+ } else {
+ WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
+ folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ }
+}
+
static inline struct folio *gup_folio_range_next(struct page *start,
unsigned long npages, unsigned long i, unsigned int *ntails)
{
@@ -3079,6 +3123,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for further details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
@@ -3110,6 +3157,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -3143,6 +3193,9 @@ EXPORT_SYMBOL(pin_user_pages_remote);
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -3161,6 +3214,9 @@ EXPORT_SYMBOL(pin_user_pages);
* pin_user_pages_unlocked() is the FOLL_PIN variant of
* get_user_pages_unlocked(). Behavior is the same, except that this one sets
* FOLL_PIN and rejects FOLL_GET.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
diff --git a/mm/gup_test.c b/mm/gup_test.c
index 8ae7307a1bb6..c0421b786dcd 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -381,6 +381,7 @@ static int gup_test_release(struct inode *inode, struct file *file)
static const struct file_operations gup_test_fops = {
.open = nonseekable_open,
.unlocked_ioctl = gup_test_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.release = gup_test_release,
};
diff --git a/mm/internal.h b/mm/internal.h
index 68410c6d97ac..e6029d94bdb2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -179,12 +179,6 @@ extern unsigned long highest_memmap_pfn;
#define MAX_RECLAIM_RETRIES 16
/*
- * in mm/early_ioremap.c
- */
-pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
- unsigned long size, pgprot_t prot);
-
-/*
* in mm/vmscan.c:
*/
bool isolate_lru_page(struct page *page);
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 2aafc46a4aaf..392fb273e7bd 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -29,7 +29,7 @@
* canary of every 8 bytes is the same. 64-bit memory can be filled and checked
* at a time instead of byte by byte to improve performance.
*/
-#define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(0x0706050403020100))
+#define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(le64_to_cpu(0x0706050403020100)))
/* Maximum stack depth for reports. */
#define KFENCE_STACK_DEPTH 64
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6b9d39d65b73..2d0d58fb4e7f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2070,7 +2070,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
xas_lock_irq(&xas);
- xas_set(&xas, index);
VM_BUG_ON_PAGE(page != xas_load(&xas), page);
diff --git a/mm/memblock.c b/mm/memblock.c
index 3feafea06ab2..50b921119600 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1436,6 +1436,15 @@ done:
*/
kmemleak_alloc_phys(found, size, 0);
+ /*
+ * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
+ * require memory to be accepted before it can be used by the
+ * guest.
+ *
+ * Accept the memory of the allocated buffer.
+ */
+ accept_memory(found, found + size);
+
return found;
}
diff --git a/mm/memfd.c b/mm/memfd.c
index 69b90c31d38c..e763e76f1106 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create,
inode->i_mode &= ~0111;
file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
- *file_seals |= F_SEAL_EXEC;
+ if (file_seals) {
+ *file_seals &= ~F_SEAL_SEAL;
+ *file_seals |= F_SEAL_EXEC;
+ }
} else if (flags & MFD_ALLOW_SEALING) {
/* MFD_EXEC and MFD_ALLOW_SEALING are set */
file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
+ if (file_seals)
+ *file_seals &= ~F_SEAL_SEAL;
}
fd_install(fd, file);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7f7f9c677854..1cfc08e25f93 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1375,6 +1375,10 @@ static void __meminit zone_init_free_lists(struct zone *zone)
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ INIT_LIST_HEAD(&zone->unaccepted_pages);
+#endif
}
void __meminit init_currently_empty_zone(struct zone *zone,
@@ -1960,6 +1964,9 @@ static void __init deferred_free_range(unsigned long pfn,
return;
}
+ /* Accept chunks smaller than MAX_ORDER upfront */
+ accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
+
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
diff --git a/mm/mmap.c b/mm/mmap.c
index 13678edaa22c..d600404580b2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2318,21 +2318,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
return __split_vma(vmi, vma, addr, new_below);
}
-static inline int munmap_sidetree(struct vm_area_struct *vma,
- struct ma_state *mas_detach)
-{
- vma_start_write(vma);
- mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
- if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
- return -ENOMEM;
-
- vma_mark_detached(vma, true);
- if (vma->vm_flags & VM_LOCKED)
- vma->vm_mm->locked_vm -= vma_pages(vma);
-
- return 0;
-}
-
/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
@@ -2354,6 +2339,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct maple_tree mt_detach;
int count = 0;
int error = -ENOMEM;
+ unsigned long locked_vm = 0;
MA_STATE(mas_detach, &mt_detach, 0, 0);
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
@@ -2399,9 +2385,13 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (error)
goto end_split_failed;
}
- error = munmap_sidetree(next, &mas_detach);
- if (error)
- goto munmap_sidetree_failed;
+ vma_start_write(next);
+ mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
+ if (mas_store_gfp(&mas_detach, next, GFP_KERNEL))
+ goto munmap_gather_failed;
+ vma_mark_detached(next, true);
+ if (next->vm_flags & VM_LOCKED)
+ locked_vm += vma_pages(next);
count++;
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -2447,10 +2437,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
#endif
/* Point of no return */
+ error = -ENOMEM;
vma_iter_set(vmi, start);
if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL))
- return -ENOMEM;
+ goto clear_tree_failed;
+ mm->locked_vm -= locked_vm;
mm->map_count -= count;
/*
* Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
@@ -2480,9 +2472,14 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
validate_mm(mm);
return downgrade ? 1 : 0;
+clear_tree_failed:
userfaultfd_error:
-munmap_sidetree_failed:
+munmap_gather_failed:
end_split_failed:
+ mas_set(&mas_detach, 0);
+ mas_for_each(&mas_detach, next, end)
+ vma_mark_detached(next, false);
+
__mt_destroy(&mt_detach);
start_split_failed:
map_count_exceeded:
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 92d3d3ca390a..c59e7561698c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -867,7 +867,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
tlb_finish_mmu(&tlb);
- if (!error && vma_iter_end(&vmi) < end)
+ if (!error && tmp < end)
error = -ENOMEM;
out:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47421bedc12b..d239fba3f31c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -387,6 +387,12 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+static bool page_contains_unaccepted(struct page *page, unsigned int order);
+static void accept_page(struct page *page, unsigned int order);
+static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static inline bool has_unaccepted_memory(void);
+static bool __free_unaccepted(struct page *page);
+
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1481,6 +1487,13 @@ void __free_pages_core(struct page *page, unsigned int order)
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+ if (page_contains_unaccepted(page, order)) {
+ if (order == MAX_ORDER && __free_unaccepted(page))
+ return;
+
+ accept_page(page, order);
+ }
+
/*
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
@@ -3159,6 +3172,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ unusable_free += zone_page_state(z, NR_UNACCEPTED);
+#endif
return unusable_free;
}
@@ -3458,6 +3474,11 @@ retry:
gfp_mask)) {
int ret;
+ if (has_unaccepted_memory()) {
+ if (try_to_accept_memory(zone, order))
+ goto try_this_zone;
+ }
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
@@ -3510,6 +3531,11 @@ try_this_zone:
return page;
} else {
+ if (has_unaccepted_memory()) {
+ if (try_to_accept_memory(zone, order))
+ goto try_this_zone;
+ }
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (deferred_pages_enabled()) {
@@ -7215,3 +7241,150 @@ bool has_managed_dma(void)
return false;
}
#endif /* CONFIG_ZONE_DMA */
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+
+/* Counts number of zones with unaccepted pages. */
+static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
+
+static bool lazy_accept = true;
+
+static int __init accept_memory_parse(char *p)
+{
+ if (!strcmp(p, "lazy")) {
+ lazy_accept = true;
+ return 0;
+ } else if (!strcmp(p, "eager")) {
+ lazy_accept = false;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+early_param("accept_memory", accept_memory_parse);
+
+static bool page_contains_unaccepted(struct page *page, unsigned int order)
+{
+ phys_addr_t start = page_to_phys(page);
+ phys_addr_t end = start + (PAGE_SIZE << order);
+
+ return range_contains_unaccepted_memory(start, end);
+}
+
+static void accept_page(struct page *page, unsigned int order)
+{
+ phys_addr_t start = page_to_phys(page);
+
+ accept_memory(start, start + (PAGE_SIZE << order));
+}
+
+static bool try_to_accept_memory_one(struct zone *zone)
+{
+ unsigned long flags;
+ struct page *page;
+ bool last;
+
+ if (list_empty(&zone->unaccepted_pages))
+ return false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ page = list_first_entry_or_null(&zone->unaccepted_pages,
+ struct page, lru);
+ if (!page) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return false;
+ }
+
+ list_del(&page->lru);
+ last = list_empty(&zone->unaccepted_pages);
+
+ __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ accept_page(page, MAX_ORDER);
+
+ __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
+
+ if (last)
+ static_branch_dec(&zones_with_unaccepted_pages);
+
+ return true;
+}
+
+static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+ long to_accept;
+ int ret = false;
+
+ /* How much to accept to get to high watermark? */
+ to_accept = high_wmark_pages(zone) -
+ (zone_page_state(zone, NR_FREE_PAGES) -
+ __zone_watermark_unusable_free(zone, order, 0));
+
+ /* Accept at least one page */
+ do {
+ if (!try_to_accept_memory_one(zone))
+ break;
+ ret = true;
+ to_accept -= MAX_ORDER_NR_PAGES;
+ } while (to_accept > 0);
+
+ return ret;
+}
+
+static inline bool has_unaccepted_memory(void)
+{
+ return static_branch_unlikely(&zones_with_unaccepted_pages);
+}
+
+static bool __free_unaccepted(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long flags;
+ bool first = false;
+
+ if (!lazy_accept)
+ return false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ first = list_empty(&zone->unaccepted_pages);
+ list_add_tail(&page->lru, &zone->unaccepted_pages);
+ __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ if (first)
+ static_branch_inc(&zones_with_unaccepted_pages);
+
+ return true;
+}
+
+#else
+
+static bool page_contains_unaccepted(struct page *page, unsigned int order)
+{
+ return false;
+}
+
+static void accept_page(struct page *page, unsigned int order)
+{
+}
+
+static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+ return false;
+}
+
+static inline bool has_unaccepted_memory(void)
+{
+ return false;
+}
+
+static bool __free_unaccepted(struct page *page)
+{
+ BUILD_BUG();
+ return false;
+}
+
+#endif /* CONFIG_UNACCEPTED_MEMORY */
diff --git a/mm/page_io.c b/mm/page_io.c
index 87b682d18850..684cd3c7b59b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,7 +338,7 @@ static void swap_writepage_bdev_sync(struct page *page,
bio_init(&bio, sis->bdev, &bv, 1,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
bio.bi_iter.bi_sector = swap_page_sector(page);
- bio_add_page(&bio, page, thp_size(page), 0);
+ __bio_add_page(&bio, page, thp_size(page), 0);
bio_associate_blkg_from_page(&bio, page);
count_swpout_vm_event(page);
@@ -360,7 +360,7 @@ static void swap_writepage_bdev_async(struct page *page,
GFP_NOIO);
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_write;
- bio_add_page(bio, page, thp_size(page), 0);
+ __bio_add_page(bio, page, thp_size(page), 0);
bio_associate_blkg_from_page(bio, page);
count_swpout_vm_event(page);
@@ -468,7 +468,7 @@ static void swap_readpage_bdev_sync(struct page *page,
bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = swap_page_sector(page);
- bio_add_page(&bio, page, thp_size(page), 0);
+ __bio_add_page(&bio, page, thp_size(page), 0);
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
@@ -488,7 +488,7 @@ static void swap_readpage_bdev_async(struct page *page,
bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_read;
- bio_add_page(bio, page, thp_size(page), 0);
+ __bio_add_page(bio, page, thp_size(page), 0);
count_vm_event(PSWPIN);
submit_bio(bio);
}
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 25d8610c0042..f2baf97d5f38 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -71,6 +71,8 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
page = pfn_to_page(pfn);
page_ext = page_ext_get(page);
+
+ BUG_ON(PageSlab(page));
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -107,6 +109,8 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
page = pfn_to_page(pfn);
page_ext = page_ext_get(page);
+
+ BUG_ON(PageSlab(page));
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -133,6 +137,8 @@ void __page_table_check_zero(struct page *page, unsigned int order)
struct page_ext *page_ext;
unsigned long i;
+ BUG_ON(PageSlab(page));
+
page_ext = page_ext_get(page);
BUG_ON(!page_ext);
for (i = 0; i < (1ul << order); i++) {
diff --git a/mm/shmem.c b/mm/shmem.c
index e40a08c5c6d7..1f504ed982cf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2731,6 +2731,138 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
+static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return true;
+}
+
+static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+}
+
+static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return false;
+}
+
+static const struct pipe_buf_operations zero_pipe_buf_ops = {
+ .release = zero_pipe_buf_release,
+ .try_steal = zero_pipe_buf_try_steal,
+ .get = zero_pipe_buf_get,
+};
+
+static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
+ loff_t fpos, size_t size)
+{
+ size_t offset = fpos & ~PAGE_MASK;
+
+ size = min_t(size_t, size, PAGE_SIZE - offset);
+
+ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &zero_pipe_buf_ops,
+ .page = ZERO_PAGE(0),
+ .offset = offset,
+ .len = size,
+ };
+ pipe->head++;
+ }
+
+ return size;
+}
+
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode = file_inode(in);
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio = NULL;
+ size_t total_spliced = 0, used, npages, n, part;
+ loff_t isize;
+ int error = 0;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ do {
+ if (*ppos >= i_size_read(inode))
+ break;
+
+ error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
+ if (error) {
+ if (error == -EINVAL)
+ error = 0;
+ break;
+ }
+ if (folio) {
+ folio_unlock(folio);
+
+ if (folio_test_hwpoison(folio)) {
+ error = -EIO;
+ break;
+ }
+ }
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size here lets us clamp this pass to "part", the
+ * bytes left before EOF, so neither the tail of a folio nor the
+ * zeroes standing in for a hole are spliced beyond i_size (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(inode);
+ if (unlikely(*ppos >= isize))
+ break;
+ part = min_t(loff_t, isize - *ppos, len);
+
+ if (folio) {
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+ folio_mark_accessed(folio);
+ /*
+ * Ok, we have the page, and it's up-to-date, so we can
+ * now splice it into the pipe.
+ */
+ n = splice_folio_into_pipe(pipe, folio, *ppos, part);
+ folio_put(folio);
+ folio = NULL;
+ } else {
+ n = splice_zeropage_into_pipe(pipe, *ppos, part);
+ }
+
+ if (!n)
+ break;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ break;
+
+ cond_resched();
+ } while (len);
+
+ if (folio)
+ folio_put(folio);
+
+ file_accessed(in);
+ return total_spliced ? total_spliced : error;
+}
+
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
struct address_space *mapping = file->f_mapping;
@@ -3971,7 +4103,7 @@ static const struct file_operations shmem_file_operations = {
.read_iter = shmem_file_read_iter,
.write_iter = generic_file_write_iter,
.fsync = noop_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = shmem_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = shmem_fallocate,
#endif
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 3f83b10c5031..3ab53fad8876 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -5,12 +5,10 @@
#include <linux/seq_file.h>
#include <linux/shrinker.h>
#include <linux/memcontrol.h>
-#include <linux/srcu.h>
/* defined in vmscan.c */
-extern struct mutex shrinker_mutex;
+extern struct rw_semaphore shrinker_rwsem;
extern struct list_head shrinker_list;
-extern struct srcu_struct shrinker_srcu;
static DEFINE_IDA(shrinker_debugfs_ida);
static struct dentry *shrinker_debugfs_root;
@@ -51,13 +49,18 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
struct mem_cgroup *memcg;
unsigned long total;
bool memcg_aware;
- int ret = 0, nid, srcu_idx;
+ int ret, nid;
count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
if (!count_per_node)
return -ENOMEM;
- srcu_idx = srcu_read_lock(&shrinker_srcu);
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ kfree(count_per_node);
+ return ret;
+ }
+ rcu_read_lock();
memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
@@ -88,7 +91,8 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
}
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ rcu_read_unlock();
+ up_read(&shrinker_rwsem);
kfree(count_per_node);
return ret;
@@ -111,8 +115,9 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
.gfp_mask = GFP_KERNEL,
};
struct mem_cgroup *memcg = NULL;
- int nid, srcu_idx;
+ int nid;
char kbuf[72];
+ ssize_t ret;
read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
if (copy_from_user(kbuf, buf, read_len))
@@ -141,7 +146,11 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
return -EINVAL;
}
- srcu_idx = srcu_read_lock(&shrinker_srcu);
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ mem_cgroup_put(memcg);
+ return ret;
+ }
sc.nid = nid;
sc.memcg = memcg;
@@ -150,7 +159,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
shrinker->scan_objects(shrinker, &sc);
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ up_read(&shrinker_rwsem);
mem_cgroup_put(memcg);
return size;
@@ -168,7 +177,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
char buf[128];
int id;
- lockdep_assert_held(&shrinker_mutex);
+ lockdep_assert_held(&shrinker_rwsem);
/* debugfs isn't initialized yet, add debugfs entries later. */
if (!shrinker_debugfs_root)
@@ -211,7 +220,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
if (!new)
return -ENOMEM;
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
old = shrinker->name;
shrinker->name = new;
@@ -229,7 +238,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
shrinker->debugfs_entry = entry;
}
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
kfree_const(old);
@@ -237,23 +246,28 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
}
EXPORT_SYMBOL(shrinker_debugfs_rename);
-struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker)
+struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
+ int *debugfs_id)
{
struct dentry *entry = shrinker->debugfs_entry;
- lockdep_assert_held(&shrinker_mutex);
+ lockdep_assert_held(&shrinker_rwsem);
kfree_const(shrinker->name);
shrinker->name = NULL;
- if (entry) {
- ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
- shrinker->debugfs_entry = NULL;
- }
+ *debugfs_id = entry ? shrinker->debugfs_id : -1;
+ shrinker->debugfs_entry = NULL;
return entry;
}
+void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id)
+{
+ debugfs_remove_recursive(debugfs_entry);
+ ida_free(&shrinker_debugfs_ida, debugfs_id);
+}
+
static int __init shrinker_debugfs_init(void)
{
struct shrinker *shrinker;
@@ -266,14 +280,14 @@ static int __init shrinker_debugfs_init(void)
shrinker_debugfs_root = dentry;
/* Create debugfs entries for shrinkers registered at boot */
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
list_for_each_entry(shrinker, &shrinker_list, list)
if (!shrinker->debugfs_entry) {
ret = shrinker_debugfs_add(shrinker);
if (ret)
break;
}
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
return ret;
}
diff --git a/mm/slab.h b/mm/slab.h
index f01ac256a8f5..bc36edd5ba4f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -6,6 +6,38 @@
*/
void __init kmem_cache_init(void);
+#ifdef CONFIG_64BIT
+# ifdef system_has_cmpxchg128
+# define system_has_freelist_aba() system_has_cmpxchg128()
+# define try_cmpxchg_freelist try_cmpxchg128
+# endif
+#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128
+typedef u128 freelist_full_t;
+#else /* CONFIG_64BIT */
+# ifdef system_has_cmpxchg64
+# define system_has_freelist_aba() system_has_cmpxchg64()
+# define try_cmpxchg_freelist try_cmpxchg64
+# endif
+#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64
+typedef u64 freelist_full_t;
+#endif /* CONFIG_64BIT */
+
+#if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#undef system_has_freelist_aba
+#endif
+
+/*
+ * Freelist pointer and counter to cmpxchg together, avoids the typical ABA
+ * problems with cmpxchg of just a pointer.
+ */
+typedef union {
+ struct {
+ void *freelist;
+ unsigned long counter;
+ };
+ freelist_full_t full;
+} freelist_aba_t;
+
/* Reuses the bits in struct page */
struct slab {
unsigned long __page_flags;
@@ -38,14 +70,21 @@ struct slab {
#endif
};
/* Double-word boundary */
- void *freelist; /* first free object */
union {
- unsigned long counters;
struct {
- unsigned inuse:16;
- unsigned objects:15;
- unsigned frozen:1;
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
};
+#ifdef system_has_freelist_aba
+ freelist_aba_t freelist_counter;
+#endif
};
};
struct rcu_head rcu_head;
@@ -72,8 +111,8 @@ SLAB_MATCH(memcg_data, memcg_data);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB)
-static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *)));
+#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB)
+static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
#endif
/**
diff --git a/mm/slub.c b/mm/slub.c
index c87628cd8a9a..7529626bbec2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -292,7 +292,12 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/* Poison object */
#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
+
+#ifdef system_has_freelist_aba
#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
+#else
+#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U)
+#endif
/*
* Tracking user of a slab.
@@ -512,6 +517,40 @@ static __always_inline void slab_unlock(struct slab *slab)
__bit_spin_unlock(PG_locked, &page->flags);
}
+static inline bool
+__update_freelist_fast(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+#ifdef system_has_freelist_aba
+ freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
+
+ return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
+#else
+ return false;
+#endif
+}
+
+static inline bool
+__update_freelist_slow(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+ bool ret = false;
+
+ slab_lock(slab);
+ if (slab->freelist == freelist_old &&
+ slab->counters == counters_old) {
+ slab->freelist = freelist_new;
+ slab->counters = counters_new;
+ ret = true;
+ }
+ slab_unlock(slab);
+
+ return ret;
+}
+
/*
* Interrupts must be disabled (for the fallback code to work right), typically
* by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
@@ -519,33 +558,25 @@ static __always_inline void slab_unlock(struct slab *slab)
* allocation/ free operation in hardirq context. Therefore nothing can
* interrupt the operation.
*/
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
+ bool ret;
+
if (USE_LOCKLESS_FAST_PATH())
lockdep_assert_irqs_disabled();
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&slab->freelist, &slab->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
- slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
- slab_unlock(slab);
- return true;
- }
- slab_unlock(slab);
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
}
+ if (likely(ret))
+ return true;
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -557,36 +588,26 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab
return false;
}
-static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ bool ret;
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&slab->freelist, &slab->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
unsigned long flags;
local_irq_save(flags);
- slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
- slab_unlock(slab);
- local_irq_restore(flags);
- return true;
- }
- slab_unlock(slab);
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
local_irq_restore(flags);
}
+ if (likely(ret))
+ return true;
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -2228,7 +2249,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
VM_BUG_ON(new.frozen);
new.frozen = 1;
- if (!__cmpxchg_double_slab(s, slab,
+ if (!__slab_update_freelist(s, slab,
freelist, counters,
new.freelist, new.counters,
"acquire_slab"))
@@ -2554,7 +2575,7 @@ redo:
}
- if (!cmpxchg_double_slab(s, slab,
+ if (!slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab")) {
@@ -2611,7 +2632,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
new.frozen = 0;
- } while (!__cmpxchg_double_slab(s, slab,
+ } while (!__slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"));
@@ -3008,6 +3029,18 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
}
#ifndef CONFIG_SLUB_TINY
+static inline bool
+__update_cpu_freelist_fast(struct kmem_cache *s,
+ void *freelist_old, void *freelist_new,
+ unsigned long tid)
+{
+ freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
+
+ return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
+ &old.full, new.full);
+}
+
/*
* Check the slab->freelist and either transfer the freelist to the
* per cpu freelist or deactivate the slab.
@@ -3034,7 +3067,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
new.inuse = slab->objects;
new.frozen = freelist != NULL;
- } while (!__cmpxchg_double_slab(s, slab,
+ } while (!__slab_update_freelist(s, slab,
freelist, counters,
NULL, new.counters,
"get_freelist"));
@@ -3359,11 +3392,7 @@ redo:
* against code executing on this cpu *not* from access by
* other cpus.
*/
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- object, tid,
- next_object, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
@@ -3631,7 +3660,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
}
}
- } while (!cmpxchg_double_slab(s, slab,
+ } while (!slab_update_freelist(s, slab,
prior, counters,
head, new.counters,
"__slab_free"));
@@ -3736,11 +3765,7 @@ redo:
set_freepointer(s, tail_obj, freelist);
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- freelist, tid,
- head, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
@@ -4505,11 +4530,11 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
}
}
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
+#ifdef system_has_freelist_aba
+ if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
+ }
#endif
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 274bbf797480..6bc83060df9a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2539,7 +2539,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct block_device *bdev = I_BDEV(inode);
set_blocksize(bdev, old_block_size);
- blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(bdev, p);
}
inode_lock(inode);
@@ -2770,7 +2770,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
if (S_ISBLK(inode->i_mode)) {
p->bdev = blkdev_get_by_dev(inode->i_rdev,
- FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
+ BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
if (IS_ERR(p->bdev)) {
error = PTR_ERR(p->bdev);
p->bdev = NULL;
@@ -3221,7 +3221,7 @@ bad_swap:
p->cluster_next_cpu = NULL;
if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
set_blocksize(p->bdev, p->old_block_size);
- blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(p->bdev, p);
}
inode = NULL;
destroy_swap_extents(p);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9683573f1225..1d13d71687d7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3098,11 +3098,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- /* vm_area_alloc_pages() can also fail due to a fatal signal */
- if (!fatal_signal_pending(current))
+ /*
+ * vm_area_alloc_pages() can fail due to insufficient memory but
+ * also:-
+ *
+ * - a pending fatal signal
+ * - insufficient huge page-order pages
+ *
+ * Since we always retry allocations at order-0 in the huge page
+ * case a warning for either is spurious.
+ */
+ if (!fatal_signal_pending(current) && page_order == 0)
warn_alloc(gfp_mask, NULL,
- "vmalloc error: size %lu, page order %u, failed to allocate pages",
- area->nr_pages * PAGE_SIZE, page_order);
+ "vmalloc error: size %lu, failed to allocate pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d257916f39e5..5bf98d0a22c9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,7 +35,7 @@
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
-#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -57,7 +57,6 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
-#include <linux/srcu.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -190,9 +189,7 @@ struct scan_control {
int vm_swappiness = 60;
LIST_HEAD(shrinker_list);
-DEFINE_MUTEX(shrinker_mutex);
-DEFINE_SRCU(shrinker_srcu);
-static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
+DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
@@ -211,21 +208,8 @@ static inline int shrinker_defer_size(int nr_items)
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
- return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
- &shrinker_srcu,
- lockdep_is_held(&shrinker_mutex));
-}
-
-static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
- int nid)
-{
- return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
- &shrinker_srcu);
-}
-
-static void free_shrinker_info_rcu(struct rcu_head *head)
-{
- kvfree(container_of(head, struct shrinker_info, rcu));
+ return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+ lockdep_is_held(&shrinker_rwsem));
}
static int expand_one_shrinker_info(struct mem_cgroup *memcg,
@@ -266,7 +250,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
defer_size - old_defer_size);
rcu_assign_pointer(pn->shrinker_info, new);
- call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
+ kvfree_rcu(old, rcu);
}
return 0;
@@ -292,7 +276,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
int nid, size, ret = 0;
int map_size, defer_size = 0;
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
map_size = shrinker_map_size(shrinker_nr_max);
defer_size = shrinker_defer_size(shrinker_nr_max);
size = map_size + defer_size;
@@ -308,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
info->map_nr_max = shrinker_nr_max;
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
}
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
return ret;
}
@@ -324,7 +308,7 @@ static int expand_shrinker_info(int new_id)
if (!root_mem_cgroup)
goto out;
- lockdep_assert_held(&shrinker_mutex);
+ lockdep_assert_held(&shrinker_rwsem);
map_size = shrinker_map_size(new_nr_max);
defer_size = shrinker_defer_size(new_nr_max);
@@ -352,16 +336,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
- int srcu_idx;
- srcu_idx = srcu_read_lock(&shrinker_srcu);
- info = shrinker_info_srcu(memcg, nid);
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
set_bit(shrinker_id, info->map);
}
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ rcu_read_unlock();
}
}
@@ -374,7 +357,8 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
if (mem_cgroup_disabled())
return -ENOSYS;
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
+ /* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -388,7 +372,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
shrinker->id = id;
ret = 0;
unlock:
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
return ret;
}
@@ -398,7 +382,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- lockdep_assert_held(&shrinker_mutex);
+ lockdep_assert_held(&shrinker_rwsem);
idr_remove(&shrinker_idr, id);
}
@@ -408,7 +392,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_srcu(memcg, nid);
+ info = shrinker_info_protected(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}
@@ -417,7 +401,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_srcu(memcg, nid);
+ info = shrinker_info_protected(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}
@@ -433,7 +417,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
parent = root_mem_cgroup;
/* Prevent from concurrent shrinker_info expand */
- mutex_lock(&shrinker_mutex);
+ down_read(&shrinker_rwsem);
for_each_node(nid) {
child_info = shrinker_info_protected(memcg, nid);
parent_info = shrinker_info_protected(parent, nid);
@@ -442,7 +426,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
atomic_long_add(nr, &parent_info->nr_deferred[i]);
}
}
- mutex_unlock(&shrinker_mutex);
+ up_read(&shrinker_rwsem);
}
static bool cgroup_reclaim(struct scan_control *sc)
@@ -743,9 +727,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
shrinker->name = NULL;
#endif
if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
return;
}
@@ -755,11 +739,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
void register_shrinker_prepared(struct shrinker *shrinker)
{
- mutex_lock(&shrinker_mutex);
- list_add_tail_rcu(&shrinker->list, &shrinker_list);
+ down_write(&shrinker_rwsem);
+ list_add_tail(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
shrinker_debugfs_add(shrinker);
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
}
static int __register_shrinker(struct shrinker *shrinker)
@@ -805,22 +789,20 @@ EXPORT_SYMBOL(register_shrinker);
void unregister_shrinker(struct shrinker *shrinker)
{
struct dentry *debugfs_entry;
+ int debugfs_id;
if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
- mutex_lock(&shrinker_mutex);
- list_del_rcu(&shrinker->list);
+ down_write(&shrinker_rwsem);
+ list_del(&shrinker->list);
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
- debugfs_entry = shrinker_debugfs_remove(shrinker);
- mutex_unlock(&shrinker_mutex);
+ debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
+ up_write(&shrinker_rwsem);
- atomic_inc(&shrinker_srcu_generation);
- synchronize_srcu(&shrinker_srcu);
-
- debugfs_remove_recursive(debugfs_entry);
+ shrinker_debugfs_remove(debugfs_entry, debugfs_id);
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
@@ -830,13 +812,15 @@ EXPORT_SYMBOL(unregister_shrinker);
/**
* synchronize_shrinkers - Wait for all running shrinkers to complete.
*
- * This is useful to guarantee that all shrinker invocations have seen an
- * update, before freeing memory.
+ * This is equivalent to calling unregister_shrinker() and register_shrinker(),
+ * but atomically and with less overhead. This is useful to guarantee that all
+ * shrinker invocations have seen an update, before freeing memory, similar to
+ * rcu.
*/
void synchronize_shrinkers(void)
{
- atomic_inc(&shrinker_srcu_generation);
- synchronize_srcu(&shrinker_srcu);
+ down_write(&shrinker_rwsem);
+ up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(synchronize_shrinkers);
@@ -945,20 +929,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
- int srcu_idx, generation;
- int i = 0;
+ int i;
if (!mem_cgroup_online(memcg))
return 0;
-again:
- srcu_idx = srcu_read_lock(&shrinker_srcu);
- info = shrinker_info_srcu(memcg, nid);
+ if (!down_read_trylock(&shrinker_rwsem))
+ return 0;
+
+ info = shrinker_info_protected(memcg, nid);
if (unlikely(!info))
goto unlock;
- generation = atomic_read(&shrinker_srcu_generation);
- for_each_set_bit_from(i, info->map, info->map_nr_max) {
+ for_each_set_bit(i, info->map, info->map_nr_max) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -1004,14 +987,14 @@ again:
set_shrinker_bit(memcg, nid, i);
}
freed += ret;
- if (atomic_read(&shrinker_srcu_generation) != generation) {
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
- i++;
- goto again;
+
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
}
}
unlock:
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ up_read(&shrinker_rwsem);
return freed;
}
#else /* CONFIG_MEMCG */
@@ -1048,7 +1031,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
- int srcu_idx, generation;
/*
* The root memcg might be allocated even though memcg is disabled
@@ -1060,11 +1042,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
- srcu_idx = srcu_read_lock(&shrinker_srcu);
+ if (!down_read_trylock(&shrinker_rwsem))
+ goto out;
- generation = atomic_read(&shrinker_srcu_generation);
- list_for_each_entry_srcu(shrinker, &shrinker_list, list,
- srcu_read_lock_held(&shrinker_srcu)) {
+ list_for_each_entry(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -1075,14 +1056,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY)
ret = 0;
freed += ret;
-
- if (atomic_read(&shrinker_srcu_generation) != generation) {
+ /*
+ * Bail out if someone wants to register a new shrinker to
+ * prevent the registration from being stalled for long periods
+ * by parallel ongoing shrinking.
+ */
+ if (rwsem_is_contended(&shrinker_rwsem)) {
freed = freed ? : 1;
break;
}
}
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ up_read(&shrinker_rwsem);
+out:
cond_resched();
return freed;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c28046371b45..282349cabf01 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1180,6 +1180,9 @@ const char * const vmstat_text[] = {
"nr_zspages",
#endif
"nr_free_cma",
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ "nr_unaccepted",
+#endif
/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 44ddaf5d601e..02f7f414aade 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1331,31 +1331,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
-#ifdef CONFIG_ZPOOL
- /*
- * Move the zspage to front of pool's LRU.
- *
- * Note that this is swap-specific, so by definition there are no ongoing
- * accesses to the memory while the page is swapped out that would make
- * it "hot". A new entry is hot, then ages to the tail until it gets either
- * written back or swaps back in.
- *
- * Furthermore, map is also called during writeback. We must not put an
- * isolated page on the LRU mid-reclaim.
- *
- * As a result, only update the LRU when the page is mapped for write
- * when it's first instantiated.
- *
- * This is a deviation from the other backends, which perform this update
- * in the allocation function (zbud_alloc, z3fold_alloc).
- */
- if (mm == ZS_MM_WO) {
- if (!list_empty(&zspage->lru))
- list_del(&zspage->lru);
- list_add(&zspage->lru, &pool->lru);
- }
-#endif
-
/*
* migration cannot move any zpages in this zspage. Here, pool->lock
* is too heavy since callers would take some time until they calls
@@ -1525,9 +1500,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
fix_fullness_group(class, zspage);
record_obj(handle, obj);
class_stat_inc(class, ZS_OBJS_INUSE, 1);
- spin_unlock(&pool->lock);
- return handle;
+ goto out;
}
spin_unlock(&pool->lock);
@@ -1550,6 +1524,14 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
+out:
+#ifdef CONFIG_ZPOOL
+ /* Add/move zspage to beginning of LRU */
+ if (!list_empty(&zspage->lru))
+ list_del(&zspage->lru);
+ list_add(&zspage->lru, &pool->lru);
+#endif
+
spin_unlock(&pool->lock);
return handle;
diff --git a/mm/zswap.c b/mm/zswap.c
index e1e621d0b6a0..30092d9a3b23 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1020,6 +1020,22 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
goto fail;
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
+ /*
+ * Having a local reference to the zswap entry doesn't exclude
+ * swapping from invalidating and recycling the swap slot. Once
+ * the swapcache is secured against concurrent swapping to and
+ * from the slot, recheck that the entry is still current before
+ * writing.
+ */
+ spin_lock(&tree->lock);
+ if (zswap_rb_search(&tree->rbroot, entry->offset) != entry) {
+ spin_unlock(&tree->lock);
+ delete_from_swap_cache(page_folio(page));
+ ret = -ENOMEM;
+ goto fail;
+ }
+ spin_unlock(&tree->lock);
+
/* decompress */
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
dlen = PAGE_SIZE;
@@ -1158,9 +1174,16 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}
+ /*
+ * XXX: zswap reclaim does not work with cgroups yet. Without a
+ * cgroup-aware entry LRU, we will push out entries system-wide based on
+ * local cgroup limits.
+ */
objcg = get_obj_cgroup_from_page(page);
- if (objcg && !obj_cgroup_may_zswap(objcg))
- goto shrink;
+ if (objcg && !obj_cgroup_may_zswap(objcg)) {
+ ret = -ENOMEM;
+ goto reject;
+ }
/* reclaim space if needed */
if (zswap_is_full()) {