summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c1
-rw-r--r--mm/filemap.c162
-rw-r--r--mm/gup.c2
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/hugetlb.c102
-rw-r--r--mm/internal.h6
-rw-r--r--mm/kasan/common.c3
-rw-r--r--mm/kasan/generic.c7
-rw-r--r--mm/kasan/kasan.h4
-rw-r--r--mm/kasan/report.c2
-rw-r--r--mm/kasan/shadow.c50
-rw-r--r--mm/khugepaged.c39
-rw-r--r--mm/kmemleak.c5
-rw-r--r--mm/ksm.c7
-rw-r--r--mm/madvise.c4
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c85
-rw-r--r--mm/memory.c17
-rw-r--r--mm/mempolicy.c3
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/mprotect.c8
-rw-r--r--mm/mremap.c25
-rw-r--r--mm/nommu.c9
-rw-r--r--mm/page_alloc.c5
-rw-r--r--mm/page_io.c8
-rw-r--r--mm/secretmem.c4
-rw-r--r--mm/shmem.c75
-rw-r--r--mm/shrinker_debug.c13
-rw-r--r--mm/swap.c30
-rw-r--r--mm/swapfile.c23
-rw-r--r--mm/vmalloc.c1
-rw-r--r--mm/vmscan.c15
-rw-r--r--mm/zsmalloc.c237
35 files changed, 703 insertions, 269 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index ca1603524bbe..8238e83385a7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1839,6 +1839,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
+ set_pageblock_skip(freepage);
break;
}
}
diff --git a/mm/filemap.c b/mm/filemap.c
index c4d4ace9cc70..b794943bce76 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,8 @@
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -2440,21 +2442,19 @@ static int filemap_read_folio(struct file *file, filler_t filler,
}
static bool filemap_range_uptodate(struct address_space *mapping,
- loff_t pos, struct iov_iter *iter, struct folio *folio)
+ loff_t pos, size_t count, struct folio *folio,
+ bool need_uptodate)
{
- int count;
-
if (folio_test_uptodate(folio))
return true;
/* pipes can't handle partially uptodate pages */
- if (iov_iter_is_pipe(iter))
+ if (need_uptodate)
return false;
if (!mapping->a_ops->is_partially_uptodate)
return false;
if (mapping->host->i_blkbits >= folio_shift(folio))
return false;
- count = iter->count;
if (folio_pos(folio) > pos) {
count -= folio_pos(folio) - pos;
pos = 0;
@@ -2466,8 +2466,8 @@ static bool filemap_range_uptodate(struct address_space *mapping,
}
static int filemap_update_page(struct kiocb *iocb,
- struct address_space *mapping, struct iov_iter *iter,
- struct folio *folio)
+ struct address_space *mapping, size_t count,
+ struct folio *folio, bool need_uptodate)
{
int error;
@@ -2501,7 +2501,8 @@ static int filemap_update_page(struct kiocb *iocb,
goto unlock;
error = 0;
- if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
+ need_uptodate))
goto unlock;
error = -EAGAIN;
@@ -2577,8 +2578,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
return 0;
}
-static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
- struct folio_batch *fbatch)
+static int filemap_get_pages(struct kiocb *iocb, size_t count,
+ struct folio_batch *fbatch, bool need_uptodate)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
@@ -2588,18 +2589,19 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
struct folio *folio;
int err = 0;
- last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+ /* "last_index" is the index of the page beyond the end of the read */
+ last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
- filemap_get_read_batch(mapping, index, last_index, fbatch);
+ filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
- filemap_get_read_batch(mapping, index, last_index, fbatch);
+ filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
@@ -2621,7 +2623,8 @@ retry:
if ((iocb->ki_flags & IOCB_WAITQ) &&
folio_batch_count(fbatch) > 1)
iocb->ki_flags |= IOCB_NOWAIT;
- err = filemap_update_page(iocb, mapping, iter, folio);
+ err = filemap_update_page(iocb, mapping, count, folio,
+ need_uptodate);
if (err)
goto err;
}
@@ -2691,7 +2694,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;
- error = filemap_get_pages(iocb, iter, &fbatch);
+ error = filemap_get_pages(iocb, iter->count, &fbatch,
+ iov_iter_is_pipe(iter));
if (error < 0)
break;
@@ -2841,6 +2845,134 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
EXPORT_SYMBOL(generic_file_read_iter);
+/*
+ * Splice subpages from a folio into a pipe.
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size)
+{
+ struct page *page;
+ size_t spliced = 0, offset = offset_in_folio(folio, fpos);
+
+ page = folio_page(folio, offset / PAGE_SIZE);
+ size = min(size, folio_size(folio) - offset);
+ offset %= PAGE_SIZE;
+
+ while (spliced < size &&
+ !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+ size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &page_cache_pipe_buf_ops,
+ .page = page,
+ .offset = offset,
+ .len = part,
+ };
+ folio_get(folio);
+ pipe->head++;
+ page++;
+ spliced += part;
+ offset = 0;
+ }
+
+ return spliced;
+}
+
+/*
+ * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
+ * a pipe.
+ */
+ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct folio_batch fbatch;
+ struct kiocb iocb;
+ size_t total_spliced = 0, used, npages;
+ loff_t isize, end_offset;
+ bool writably_mapped;
+ int i, error = 0;
+
+ init_sync_kiocb(&iocb, in);
+ iocb.ki_pos = *ppos;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ folio_batch_init(&fbatch);
+
+ do {
+ cond_resched();
+
+ if (*ppos >= i_size_read(file_inode(in)))
+ break;
+
+ iocb.ki_pos = *ppos;
+ error = filemap_get_pages(&iocb, len, &fbatch, true);
+ if (error < 0)
+ break;
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(file_inode(in));
+ if (unlikely(*ppos >= isize))
+ break;
+ end_offset = min_t(loff_t, isize, *ppos + len);
+
+ /*
+ * Once we start copying data, we don't want to be touching any
+ * cachelines that might be contended:
+ */
+ writably_mapped = mapping_writably_mapped(in->f_mapping);
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ size_t n;
+
+ if (folio_pos(folio) >= end_offset)
+ goto out;
+ folio_mark_accessed(folio);
+
+ /*
+ * If users can be writing to this folio using arbitrary
+ * virtual addresses, take care of potential aliasing
+ * before reading the folio on the kernel side.
+ */
+ if (writably_mapped)
+ flush_dcache_folio(folio);
+
+ n = min_t(loff_t, len, isize - *ppos);
+ n = splice_folio_into_pipe(pipe, folio, *ppos, n);
+ if (!n)
+ goto out;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ goto out;
+ }
+
+ folio_batch_release(&fbatch);
+ } while (len);
+
+out:
+ folio_batch_release(&fbatch);
+ file_accessed(in);
+
+ return total_spliced ? total_spliced : error;
+}
+EXPORT_SYMBOL(filemap_splice_read);
+
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
struct address_space *mapping, struct folio *folio,
loff_t start, loff_t end, bool seek_data)
diff --git a/mm/gup.c b/mm/gup.c
index f45a3a5be53a..7c034514ddd8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1914,7 +1914,7 @@ static unsigned long collect_longterm_unpinnable_pages(
drain_allow = false;
}
- if (!folio_isolate_lru(folio))
+ if (folio_isolate_lru(folio))
continue;
list_add_tail(&folio->lru, movable_page_list);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abe6cfd92ffa..1b791b26d72d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3272,8 +3272,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
- if (is_writable_migration_entry(entry))
- pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
if (!is_migration_entry_young(entry))
@@ -3281,6 +3279,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
/* NOTE: this may contain setting soft-dirty on some archs */
if (PageDirty(new) && is_migration_entry_dirty(entry))
pmde = pmd_mkdirty(pmde);
+ if (is_writable_migration_entry(entry))
+ pmde = maybe_pmd_mkwrite(pmde, vma);
+ else
+ pmde = pmd_wrprotect(pmde);
if (PageAnon(new)) {
rmap_t rmap_flags = RMAP_COMPOUND;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db895230ee7e..bdbfeb6fb393 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -94,6 +94,8 @@ static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -1181,7 +1183,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
- * Called with mm->mmap_sem writer semaphore held.
+ * Called with mm->mmap_lock writer semaphore held.
* This function should be only used by move_vma() and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
@@ -4834,6 +4836,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+
+ /*
+ * PMD sharing is only possible for PUD_SIZE-aligned address ranges
+ * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
+ * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ */
+ if (addr & ~PUD_MASK) {
+ /*
+ * hugetlb_vm_op_split is called right before we attempt to
+ * split the VMA. We will need to unshare PMDs in the old and
+ * new VMAs, so let's unshare before we split.
+ */
+ unsigned long floor = addr & PUD_MASK;
+ unsigned long ceil = floor + PUD_SIZE;
+
+ if (floor >= vma->vm_start && ceil <= vma->vm_end)
+ hugetlb_unshare_pmds(vma, floor, ceil);
+ }
+
return 0;
}
@@ -5030,6 +5051,9 @@ again:
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_pte_marker(entry))) {
+ /* No swap on hugetlb */
+ WARN_ON_ONCE(
+ is_swapin_error_entry(pte_to_swp_entry(entry)));
/*
* We copy the pte marker only if the dst vma has
* uffd-wp enabled.
@@ -5131,7 +5155,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
/*
* We don't have to worry about the ordering of src and dst ptlocks
- * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
+ * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
*/
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -6639,8 +6663,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, psize);
if (!ptep) {
- address |= last_addr_mask;
- continue;
+ if (!uffd_wp) {
+ address |= last_addr_mask;
+ continue;
+ }
+ /*
+ * Userfaultfd wr-protect requires pgtable
+ * pre-allocations to install pte markers.
+ */
+ ptep = huge_pte_alloc(mm, vma, address, psize);
+ if (!ptep)
+ break;
}
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, address, ptep)) {
@@ -6658,16 +6691,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
pte = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(is_hugetlb_entry_migration(pte))) {
+ /* Nothing to do. */
+ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
struct page *page = pfn_swap_entry_to_page(entry);
+ pte_t newpte = pte;
- if (!is_readable_migration_entry(entry)) {
- pte_t newpte;
-
+ if (is_writable_migration_entry(entry)) {
if (PageAnon(page))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
@@ -6675,25 +6705,22 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
entry = make_readable_migration_entry(
swp_offset(entry));
newpte = swp_entry_to_pte(entry);
- if (uffd_wp)
- newpte = pte_swp_mkuffd_wp(newpte);
- else if (uffd_wp_resolve)
- newpte = pte_swp_clear_uffd_wp(newpte);
- set_huge_pte_at(mm, address, ptep, newpte);
pages++;
}
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(pte_marker_uffd_wp(pte))) {
- /*
- * This is changing a non-present pte into a none pte,
- * no need for huge_ptep_modify_prot_start/commit().
- */
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+ if (!pte_same(pte, newpte))
+ set_huge_pte_at(mm, address, ptep, newpte);
+ } else if (unlikely(is_pte_marker(pte))) {
+ /* No other markers apply for now. */
+ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
if (uffd_wp_resolve)
+ /* Safe to modify directly (non-present->none). */
huge_pte_clear(mm, address, ptep, psize);
- }
- if (!huge_pte_none(pte)) {
+ } else if (!huge_pte_none(pte)) {
pte_t old_pte;
unsigned int shift = huge_page_shift(hstate_vma(vma));
@@ -7328,26 +7355,21 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
}
-/*
- * This function will unconditionally remove all the shared pmd pgtable entries
- * within the specific vma for a hugetlbfs memory range.
- */
-void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long address, start, end;
+ unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- start = ALIGN(vma->vm_start, PUD_SIZE);
- end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
if (start >= end)
return;
@@ -7379,6 +7401,16 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_invalidate_range_end(&range);
}
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+ hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+}
+
#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8b032d..6d4ca98f3844 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -795,6 +795,12 @@ struct migration_target_control {
};
/*
+ * mm/filemap.c
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size);
+
+/*
* mm/vmalloc.c
*/
#ifdef CONFIG_MMU
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 833bf2cfd2a3..21e66d7f261d 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -246,6 +246,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object,
static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
{
+ if (!kasan_arch_is_ready())
+ return false;
+
if (ptr != page_address(virt_to_head_page(ptr))) {
kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE);
return true;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index b076f597a378..cb762982c8ba 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -191,7 +191,12 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
bool kasan_byte_accessible(const void *addr)
{
- s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
+ s8 shadow_byte;
+
+ if (!kasan_arch_is_ready())
+ return true;
+
+ shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE;
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ea8cf1310b1e..71c15438afcf 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -618,6 +618,10 @@ void __asan_set_shadow_f3(const void *addr, size_t size);
void __asan_set_shadow_f5(const void *addr, size_t size);
void __asan_set_shadow_f8(const void *addr, size_t size);
+void *__asan_memset(void *addr, int c, size_t len);
+void *__asan_memmove(void *dest, const void *src, size_t len);
+void *__asan_memcpy(void *dest, const void *src, size_t len);
+
void __hwasan_load1_noabort(unsigned long addr);
void __hwasan_store1_noabort(unsigned long addr);
void __hwasan_load2_noabort(unsigned long addr);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 1d02757e90a3..22598b20c7b7 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
* Whether the KASAN KUnit test suite is currently being executed.
* Updated in kasan_test.c.
*/
-bool kasan_kunit_executing;
+static bool kasan_kunit_executing;
void kasan_kunit_test_suite_start(void)
{
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 2fba1f51f042..3703983a8e55 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -38,6 +38,12 @@ bool __kasan_check_write(const volatile void *p, unsigned int size)
}
EXPORT_SYMBOL(__kasan_check_write);
+#ifndef CONFIG_GENERIC_ENTRY
+/*
+ * CONFIG_GENERIC_ENTRY relies on compiler emitted mem*() calls to not be
+ * instrumented. KASAN enabled toolchains should emit __asan_mem*() functions
+ * for the sites they want to instrument.
+ */
#undef memset
void *memset(void *addr, int c, size_t len)
{
@@ -68,6 +74,38 @@ void *memcpy(void *dest, const void *src, size_t len)
return __memcpy(dest, src, len);
}
+#endif
+
+void *__asan_memset(void *addr, int c, size_t len)
+{
+ if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+ return NULL;
+
+ return __memset(addr, c, len);
+}
+EXPORT_SYMBOL(__asan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__asan_memmove(void *dest, const void *src, size_t len)
+{
+ if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+ !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memmove(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memmove);
+#endif
+
+void *__asan_memcpy(void *dest, const void *src, size_t len)
+{
+ if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+ !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memcpy(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memcpy);
void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
@@ -291,6 +329,9 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
unsigned long shadow_start, shadow_end;
int ret;
+ if (!kasan_arch_is_ready())
+ return 0;
+
if (!is_vmalloc_or_module_addr((void *)addr))
return 0;
@@ -459,6 +500,9 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long region_start, region_end;
unsigned long size;
+ if (!kasan_arch_is_ready())
+ return;
+
region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE);
@@ -502,6 +546,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
* with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
*/
+ if (!kasan_arch_is_ready())
+ return (void *)start;
+
if (!is_vmalloc_or_module_addr(start))
return (void *)start;
@@ -524,6 +571,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
*/
void __kasan_poison_vmalloc(const void *start, unsigned long size)
{
+ if (!kasan_arch_is_ready())
+ return;
+
if (!is_vmalloc_or_module_addr(start))
return;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5cb401aa2b9d..a26a28e3738c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
+/*
+ * See pmd_trans_unstable() for how the result may change out from
+ * underneath us, even if we hold mmap_lock in read.
+ */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address,
pmd_t **pmd)
@@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
#endif
if (pmd_none(pmde))
return SCAN_PMD_NONE;
+ if (!pmd_present(pmde))
+ return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
+ if (pmd_devmap(pmde))
+ return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
@@ -1460,14 +1468,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
return SCAN_VMA_CHECK;
- /*
- * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
- * that got written to. Without this, we'd have to also lock the
- * anon_vma if one exists.
- */
- if (vma->anon_vma)
- return SCAN_VMA_CHECK;
-
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
@@ -1567,8 +1567,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
}
/* step 4: remove pte entries */
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
collapse_and_free_pmd(mm, vma, haddr, pmd);
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
i_mmap_unlock_write(vma->vm_file->f_mapping);
maybe_install_pmd:
@@ -1644,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* has higher cost too. It would also probably require locking
* the anon_vma.
*/
- if (vma->anon_vma) {
+ if (READ_ONCE(vma->anon_vma)) {
result = SCAN_PAGE_ANON;
goto next;
}
@@ -1673,6 +1679,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
if ((cc->is_khugepaged || is_target) &&
mmap_write_trylock(mm)) {
/*
+ * Re-check whether we have an ->anon_vma, because
+ * collapse_and_free_pmd() requires that either no
+ * ->anon_vma exists or the anon_vma is locked.
+ * We already checked ->anon_vma above, but that check
+ * is racy because ->anon_vma can be populated under the
+ * mmap lock in read mode.
+ */
+ if (vma->anon_vma) {
+ result = SCAN_PAGE_ANON;
+ goto unlock_next;
+ }
+ /*
* When a vma is registered with uffd-wp, we can't
* recycle the pmd pgtable because there can be pte
* markers installed. Skip it only, so the rest mm/vma
@@ -2593,6 +2611,7 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_CGROUP_CHARGE_FAIL:
return -EBUSY;
/* Resource temporary unavailable - trying again might succeed */
+ case SCAN_PAGE_COUNT:
case SCAN_PAGE_LOCK:
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
@@ -2649,7 +2668,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out_nolock;
}
- hend = vma->vm_end & HPAGE_PMD_MASK;
+ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 92f670edbf51..55dc8b8b0616 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
- else if (strcmp(str, "on") == 0)
+ else if (strcmp(str, "on") == 0) {
kmemleak_skip_disable = 1;
+ stack_depot_want_early_init();
+ }
else
return -EINVAL;
return 0;
@@ -2093,7 +2095,6 @@ void __init kmemleak_init(void)
if (kmemleak_error)
return;
- stack_depot_init();
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
diff --git a/mm/ksm.c b/mm/ksm.c
index dd02780c387f..addf490da146 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2629,8 +2629,11 @@ struct page *ksm_might_need_to_copy(struct page *page,
new_page = NULL;
}
if (new_page) {
- copy_user_highpage(new_page, page, address, vma);
-
+ if (copy_mc_user_highpage(new_page, page, address, vma)) {
+ put_page(new_page);
+ memory_failure_queue(page_to_pfn(page), 0);
+ return ERR_PTR(-EHWPOISON);
+ }
SetPageDirty(new_page);
__SetPageUptodate(new_page);
__SetPageLocked(new_page);
diff --git a/mm/madvise.c b/mm/madvise.c
index a56a6d17e201..18c2e2affac4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -130,7 +130,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
#endif /* CONFIG_ANON_VMA_NAME */
/*
* Update the vm_flags on region of a vma, splitting it or merging it as
- * necessary. Must be called with mmap_sem held for writing;
+ * necessary. Must be called with mmap_lock held for writing;
* Caller should ensure anon_name stability by raising its refcount even when
* anon_name belongs to a valid vma because this function might free that vma.
*/
@@ -329,7 +329,7 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
* otherwise we'd be including shared non-exclusive mappings, which
* opens a side channel.
*/
- return inode_owner_or_capable(&init_user_ns,
+ return inode_owner_or_capable(&nop_mnt_idmap,
file_inode(vma->vm_file)) ||
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 511d4783dcf1..d036c7861310 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -836,7 +836,7 @@ void __init_memblock memblock_free(void *ptr, size_t size)
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
- * Free boot memory block previously allocated by memblock_alloc_xx() API.
+ * Free boot memory block previously allocated by memblock_phys_alloc_xx() API.
* The freeing memory will not be released to the buddy allocator.
*/
int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ab457f0394ab..49f40730e711 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,7 +63,6 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
-#include <linux/parser.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -89,6 +88,9 @@ static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;
+/* BPF memory accounting disabled? */
+static bool cgroup_memory_nobpf __ro_after_init;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -348,6 +350,9 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
+
+DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
+EXPORT_SYMBOL(memcg_bpf_enabled_key);
#endif
/**
@@ -2393,8 +2398,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
- MEMCG_RECLAIM_MAY_SWAP,
- NULL);
+ MEMCG_RECLAIM_MAY_SWAP);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2685,8 +2689,7 @@ retry:
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options,
- NULL);
+ gfp_mask, reclaim_options);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3506,8 +3509,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
- NULL)) {
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
ret = -EBUSY;
break;
}
@@ -3618,8 +3620,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP,
- NULL))
+ MEMCG_RECLAIM_MAY_SWAP))
nr_retries--;
}
@@ -5362,6 +5363,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
+#if defined(CONFIG_MEMCG_KMEM)
+ if (!cgroup_memory_nobpf)
+ static_branch_inc(&memcg_bpf_enabled_key);
+#endif
+
return &memcg->css;
}
@@ -5446,6 +5452,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);
+#if defined(CONFIG_MEMCG_KMEM)
+ if (!cgroup_memory_nobpf)
+ static_branch_dec(&memcg_bpf_enabled_key);
+#endif
+
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
@@ -6429,8 +6440,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
- NULL);
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
if (!reclaimed && !nr_retries--)
break;
@@ -6479,8 +6489,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
- NULL))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
nr_reclaims--;
continue;
}
@@ -6603,54 +6612,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
-enum {
- MEMORY_RECLAIM_NODES = 0,
- MEMORY_RECLAIM_NULL,
-};
-
-static const match_table_t if_tokens = {
- { MEMORY_RECLAIM_NODES, "nodes=%s" },
- { MEMORY_RECLAIM_NULL, NULL },
-};
-
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
- unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
- MEMCG_RECLAIM_PROACTIVE;
- char *old_buf, *start;
- substring_t args[MAX_OPT_ARGS];
- int token;
- char value[256];
- nodemask_t nodemask = NODE_MASK_ALL;
-
- buf = strstrip(buf);
-
- old_buf = buf;
- nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
- if (buf == old_buf)
- return -EINVAL;
+ unsigned int reclaim_options;
+ int err;
buf = strstrip(buf);
+ err = page_counter_memparse(buf, "", &nr_to_reclaim);
+ if (err)
+ return err;
- while ((start = strsep(&buf, " ")) != NULL) {
- if (!strlen(start))
- continue;
- token = match_token(start, if_tokens, args);
- match_strlcpy(value, args, sizeof(value));
- switch (token) {
- case MEMORY_RECLAIM_NODES:
- if (nodelist_parse(value, nodemask) < 0)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- }
-
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed;
@@ -6667,8 +6643,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
- GFP_KERNEL, reclaim_options,
- &nodemask);
+ GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
@@ -7310,6 +7285,8 @@ static int __init cgroup_memory(char *s)
cgroup_memory_nosocket = true;
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
+ if (!strcmp(token, "nobpf"))
+ cgroup_memory_nobpf = true;
}
return 1;
}
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..f526b9152bef 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -828,12 +828,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
- /*
- * We're copying the pgtable should only because dst_vma has
- * uffd-wp enabled, do sanity check.
- */
- WARN_ON_ONCE(!userfaultfd_wp(dst_vma));
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
+ set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
if (!userfaultfd_wp(dst_vma))
@@ -3629,8 +3625,12 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
/*
* Be careful so that we will only recover a special uffd-wp pte into a
* none pte. Otherwise it means the pte could have changed, so retry.
+ *
+ * This should also cover the case where e.g. the pte changed
+ * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+ * So is_pte_marker() check is not enough to safely drop the pte.
*/
- if (is_pte_marker(*vmf->pte))
+ if (pte_same(vmf->orig_pte, *vmf->pte))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -3840,6 +3840,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
goto out_page;
+ } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
+ ret = VM_FAULT_HWPOISON;
+ goto out_page;
}
folio = page_folio(page);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 02c8a712282f..f940395667c8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -600,7 +600,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
+ !hugetlb_pmd_shared(pte))) {
if (isolate_hugetlb(page, qp->pagelist) &&
(flags & MPOL_MF_STRICT))
/*
diff --git a/mm/migrate.c b/mm/migrate.c
index a4d3fc65085f..cc5455614e01 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -224,6 +224,8 @@ static bool remove_migration_pte(struct folio *folio,
pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
pte = pte_mkuffd_wp(pte);
+ else
+ pte = pte_wrprotect(pte);
if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
rmap_flags |= RMAP_EXCLUSIVE;
diff --git a/mm/mincore.c b/mm/mincore.c
index a085a2aeabd8..cd69b9db0081 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -168,7 +168,7 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
* for writing; otherwise we'd be including shared non-exclusive
* mappings, which opens a side channel.
*/
- return inode_owner_or_capable(&init_user_ns,
+ return inode_owner_or_capable(&nop_mnt_idmap,
file_inode(vma->vm_file)) ||
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 87d929316d57..425a9349e610 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1524,6 +1524,10 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_wp(vma))
+ return 1;
+
/* Specialty mapping? */
if (vm_flags & VM_PFNMAP)
return 0;
@@ -2290,7 +2294,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
* @start: The aligned start address to munmap.
* @end: The aligned end address to munmap.
* @uf: The userfaultfd list_head
- * @downgrade: Set to true to attempt a write downgrade of the mmap_sem
+ * @downgrade: Set to true to attempt a write downgrade of the mmap_lock
*
* If @downgrade is true, check return code for potential release of the lock.
*/
@@ -2465,7 +2469,7 @@ map_count_exceeded:
* @len: The length of the range to munmap
* @uf: The userfaultfd list_head
* @downgrade: set to true if the user wants to attempt to write_downgrade the
- * mmap_sem
+ * mmap_lock
*
* This function takes a @mas that is either pointing to the previous VMA or set
* to MA_START and sets it up to remove the mapping(s). The @len will be
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 908df12caa26..61cf60015a8b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -245,7 +245,13 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
newpte = pte_swp_mksoft_dirty(newpte);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
- } else if (pte_marker_entry_uffd_wp(entry)) {
+ } else if (is_pte_marker_entry(entry)) {
+ /*
+ * Ignore swapin errors unconditionally,
+ * because any access should sigbus anyway.
+ */
+ if (is_swapin_error_entry(entry))
+ continue;
/*
* If this is uffd-wp pte marker and we'd like
* to unprotect it, drop it; the next page
diff --git a/mm/mremap.c b/mm/mremap.c
index fe587c5d6591..930f65c315c0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1027,16 +1027,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
/*
- * Function vma_merge() is called on the extension we are adding to
- * the already existing vma, vma_merge() will merge this extension with
- * the already existing vma (expand operation itself) and possibly also
- * with the next vma if it becomes adjacent to the expanded vma and
- * otherwise compatible.
+ * Function vma_merge() is called on the extension we
+ * are adding to the already existing vma, vma_merge()
+ * will merge this extension with the already existing
+ * vma (expand operation itself) and possibly also with
+ * the next vma if it becomes adjacent to the expanded
+ * vma and otherwise compatible.
+ *
+ * However, vma_merge() can currently fail due to
+ * is_mergeable_vma() check for vm_ops->close (see the
+ * comment there). Yet this should not prevent vma
+ * expanding, so perform a simple expand for such vma.
+ * Ideally the check for close op should be only done
+ * when a vma would be actually removed due to a merge.
*/
- vma = vma_merge(mm, vma, extension_start, extension_end,
+ if (!vma->vm_ops || !vma->vm_ops->close) {
+ vma = vma_merge(mm, vma, extension_start, extension_end,
vma->vm_flags, vma->anon_vma, vma->vm_file,
extension_pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ } else if (vma_adjust(vma, vma->vm_start, addr + new_len,
+ vma->vm_pgoff, NULL)) {
+ vma = NULL;
+ }
if (!vma) {
vm_unacct_memory(pages);
ret = -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 214c70e1d059..5b83938ecb67 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -559,7 +559,6 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
{
- mm->map_count++;
vma->vm_mm = mm;
/* add the VMA to the mapping */
@@ -587,6 +586,7 @@ static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm,
BUG_ON(!vma->vm_region);
setup_vma_to_mm(vma, mm);
+ mm->map_count++;
/* add the VMA to the tree */
vma_mas_store(vma, mas);
@@ -1240,6 +1240,7 @@ share:
error_just_free:
up_write(&nommu_region_sem);
error:
+ mas_destroy(&mas);
if (region->vm_file)
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
@@ -1250,7 +1251,6 @@ error:
sharing_violation:
up_write(&nommu_region_sem);
- mas_destroy(&mas);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
@@ -1347,6 +1347,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_file)
return -ENOMEM;
+ mm = vma->vm_mm;
if (mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
@@ -1398,6 +1399,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
mas_set_range(&mas, vma->vm_start, vma->vm_end - 1);
mas_store(&mas, vma);
vma_mas_store(new, &mas);
+ mm->map_count++;
return 0;
err_mas_preallocate:
@@ -1509,7 +1511,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
erase_whole_vma:
if (delete_vma_from_mm(vma))
ret = -ENOMEM;
- delete_vma(mm, vma);
+ else
+ delete_vma(mm, vma);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0745aedebb37..3bb3484563ed 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5631,9 +5631,12 @@ EXPORT_SYMBOL(get_zeroed_page);
*/
void __free_pages(struct page *page, unsigned int order)
{
+ /* get PageHead before we drop reference */
+ int head = PageHead(page);
+
if (put_page_testzero(page))
free_the_page(page, order);
- else if (!PageHead(page))
+ else if (!head)
while (order-- > 0)
free_the_page(page + (1 << order), order);
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 3a5f921b932e..233f6e6eb1c5 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -318,9 +318,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
sio->pages = 0;
sio->len = 0;
}
- sio->bvec[sio->pages].bv_page = page;
- sio->bvec[sio->pages].bv_len = thp_size(page);
- sio->bvec[sio->pages].bv_offset = 0;
+ bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
sio->len += thp_size(page);
sio->pages += 1;
if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
@@ -432,9 +430,7 @@ static void swap_readpage_fs(struct page *page,
sio->pages = 0;
sio->len = 0;
}
- sio->bvec[sio->pages].bv_page = page;
- sio->bvec[sio->pages].bv_len = thp_size(page);
- sio->bvec[sio->pages].bv_offset = 0;
+ bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
sio->len += thp_size(page);
sio->pages += 1;
if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 04c3ac9448a1..afcf46e99cda 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -162,7 +162,7 @@ const struct address_space_operations secretmem_aops = {
.migrate_folio = secretmem_migrate_folio,
};
-static int secretmem_setattr(struct user_namespace *mnt_userns,
+static int secretmem_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
@@ -175,7 +175,7 @@ static int secretmem_setattr(struct user_namespace *mnt_userns,
if ((ia_valid & ATTR_SIZE) && inode->i_size)
ret = -EINVAL;
else
- ret = simple_setattr(mnt_userns, dentry, iattr);
+ ret = simple_setattr(idmap, dentry, iattr);
filemap_invalidate_unlock(mapping);
diff --git a/mm/shmem.c b/mm/shmem.c
index c301487be5fb..41f82c5a5e28 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -478,12 +478,10 @@ bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
return false;
- if (shmem_huge_force)
- return true;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- return true;
if (shmem_huge == SHMEM_HUGE_DENY)
return false;
+ if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
+ return true;
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
@@ -1047,7 +1045,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
-static int shmem_getattr(struct user_namespace *mnt_userns,
+static int shmem_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
@@ -1068,7 +1066,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(idmap, inode, stat);
if (shmem_is_huge(NULL, inode, 0, false))
stat->blksize = HPAGE_PMD_SIZE;
@@ -1082,7 +1080,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
return 0;
}
-static int shmem_setattr(struct user_namespace *mnt_userns,
+static int shmem_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -1091,7 +1089,7 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
bool update_mtime = false;
bool update_ctime = true;
- error = setattr_prepare(&init_user_ns, dentry, attr);
+ error = setattr_prepare(idmap, dentry, attr);
if (error)
return error;
@@ -1129,9 +1127,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
}
}
- setattr_copy(&init_user_ns, inode, attr);
+ setattr_copy(idmap, inode, attr);
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
+ error = posix_acl_chmod(idmap, dentry, inode->i_mode);
if (!error && update_ctime) {
inode->i_ctime = current_time(inode);
if (update_mtime)
@@ -2329,8 +2327,9 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
#define shmem_initxattrs NULL
#endif
-static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
- umode_t mode, dev_t dev, unsigned long flags)
+static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
+ struct inode *dir, umode_t mode, dev_t dev,
+ unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
@@ -2343,7 +2342,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
inode = new_inode(sb);
if (inode) {
inode->i_ino = ino;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(idmap, inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_generation = get_random_u32();
@@ -2915,13 +2914,13 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
* File creation. Allocate an inode, and we're done..
*/
static int
-shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
- inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
error = simple_acl_create(dir, inode);
if (error)
@@ -2946,13 +2945,13 @@ out_iput:
}
static int
-shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
struct inode *inode;
int error = -ENOSPC;
- inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
if (inode) {
error = security_inode_init_security(inode, dir,
NULL,
@@ -2970,22 +2969,22 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
int error;
- if ((error = shmem_mknod(&init_user_ns, dir, dentry,
- mode | S_IFDIR, 0)))
+ error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
+ if (error)
return error;
inc_nlink(dir);
return 0;
}
-static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
+static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
- return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
+ return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}
/*
@@ -3045,7 +3044,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
return shmem_unlink(dir, dentry);
}
-static int shmem_whiteout(struct user_namespace *mnt_userns,
+static int shmem_whiteout(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry)
{
struct dentry *whiteout;
@@ -3055,7 +3054,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns,
if (!whiteout)
return -ENOMEM;
- error = shmem_mknod(&init_user_ns, old_dir, whiteout,
+ error = shmem_mknod(idmap, old_dir, whiteout,
S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
dput(whiteout);
if (error)
@@ -3078,7 +3077,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns,
* it exists so that the VFS layer correctly free's it when it
* gets overwritten.
*/
-static int shmem_rename2(struct user_namespace *mnt_userns,
+static int shmem_rename2(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
@@ -3098,7 +3097,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns,
if (flags & RENAME_WHITEOUT) {
int error;
- error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
+ error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error)
return error;
}
@@ -3124,7 +3123,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns,
return 0;
}
-static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
int error;
@@ -3136,7 +3135,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (len > PAGE_SIZE)
return -ENAMETOOLONG;
- inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
if (!inode)
return -ENOSPC;
@@ -3229,7 +3228,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
return 0;
}
-static int shmem_fileattr_set(struct user_namespace *mnt_userns,
+static int shmem_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa)
{
struct inode *inode = d_inode(dentry);
@@ -3303,7 +3302,7 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
}
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
+ struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
@@ -3819,7 +3818,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
uuid_gen(&sb->s_uuid);
- inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
+ VM_NORESERVE);
if (!inode)
goto failed;
inode->i_uid = sbinfo->uid;
@@ -4044,7 +4044,11 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
+#ifdef CONFIG_SHMEM
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+#else
.fs_flags = FS_USERNS_MOUNT,
+#endif
};
void __init shmem_init(void)
@@ -4196,7 +4200,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
#define shmem_anon_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
-#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
+#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
@@ -4219,8 +4223,11 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
- inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
- flags);
+ if (is_idmapped_mnt(mnt))
+ return ERR_PTR(-EINVAL);
+
+ inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
+ S_IFREG | S_IRWXUGO, 0, flags);
if (unlikely(!inode)) {
shmem_unacct_size(flags, size);
return ERR_PTR(-ENOSPC);
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index b05295bab322..39c3491e28a3 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -246,18 +246,21 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
}
EXPORT_SYMBOL(shrinker_debugfs_rename);
-void shrinker_debugfs_remove(struct shrinker *shrinker)
+struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker)
{
+ struct dentry *entry = shrinker->debugfs_entry;
+
lockdep_assert_held(&shrinker_rwsem);
kfree_const(shrinker->name);
shrinker->name = NULL;
- if (!shrinker->debugfs_entry)
- return;
+ if (entry) {
+ ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+ shrinker->debugfs_entry = NULL;
+ }
- debugfs_remove_recursive(shrinker->debugfs_entry);
- ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+ return entry;
}
static int __init shrinker_debugfs_init(void)
diff --git a/mm/swap.c b/mm/swap.c
index 70e2063ef43a..4c03ecab698e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -158,36 +158,6 @@ void put_pages_list(struct list_head *pages)
}
EXPORT_SYMBOL(put_pages_list);
-/*
- * get_kernel_pages() - pin kernel pages in memory
- * @kiov: An array of struct kvec structures
- * @nr_segs: number of segments to pin
- * @write: pinning for read/write, currently ignored
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_segs long.
- *
- * Returns number of pages pinned. This may be fewer than the number requested.
- * If nr_segs is 0 or negative, returns 0. If no pages were pinned, returns 0.
- * Each page returned must be released with a put_page() call when it is
- * finished with.
- */
-int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
- struct page **pages)
-{
- int seg;
-
- for (seg = 0; seg < nr_segs; seg++) {
- if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
- return seg;
-
- pages[seg] = kmap_to_page(kiov[seg].iov_base);
- get_page(pages[seg]);
- }
-
- return seg;
-}
-EXPORT_SYMBOL_GPL(get_kernel_pages);
-
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 908a529bca12..0ab52d16bde6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1100,6 +1100,7 @@ start_over:
goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type);
+ cond_resched();
spin_lock(&swap_avail_lock);
nextsi:
@@ -1763,12 +1764,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
struct page *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte;
+ bool hwposioned = false;
int ret = 1;
swapcache = page;
page = ksm_might_need_to_copy(page, vma, addr);
if (unlikely(!page))
return -ENOMEM;
+ else if (unlikely(PTR_ERR(page) == -EHWPOISON))
+ hwposioned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
@@ -1776,15 +1780,19 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
- if (unlikely(!PageUptodate(page))) {
- pte_t pteval;
+ if (unlikely(hwposioned || !PageUptodate(page))) {
+ swp_entry_t swp_entry;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
- pteval = swp_entry_to_pte(make_swapin_error_entry());
- set_pte_at(vma->vm_mm, addr, pte, pteval);
- swap_free(entry);
+ if (hwposioned) {
+ swp_entry = make_hwpoison_entry(swapcache);
+ page = swapcache;
+ } else {
+ swp_entry = make_swapin_error_entry();
+ }
+ new_pte = swp_entry_to_pte(swp_entry);
ret = 0;
- goto out;
+ goto setpte;
}
/* See do_swap_page() */
@@ -1816,6 +1824,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
new_pte = pte_mksoft_dirty(new_pte);
if (pte_swp_uffd_wp(*pte))
new_pte = pte_mkuffd_wp(new_pte);
+setpte:
set_pte_at(vma->vm_mm, addr, pte, new_pte);
swap_free(entry);
out:
@@ -3642,7 +3651,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
* We've already scheduled a throttle, avoid taking the global swap
* lock.
*/
- if (current->throttle_queue)
+ if (current->throttle_disk)
return;
spin_lock(&swap_avail_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ca71de7c9d77..61f5bec0f2b6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -656,6 +656,7 @@ int is_vmalloc_or_module_addr(const void *x)
#endif
return is_vmalloc_addr(x);
}
+EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
/*
* Walk a vmap address to the struct page it maps. Huge vmap mappings will
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd6637fcd8f9..5b7b8d4f5297 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -741,6 +741,8 @@ EXPORT_SYMBOL(register_shrinker);
*/
void unregister_shrinker(struct shrinker *shrinker)
{
+ struct dentry *debugfs_entry;
+
if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
@@ -749,9 +751,11 @@ void unregister_shrinker(struct shrinker *shrinker)
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
- shrinker_debugfs_remove(shrinker);
+ debugfs_entry = shrinker_debugfs_remove(shrinker);
up_write(&shrinker_rwsem);
+ debugfs_remove_recursive(debugfs_entry);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
@@ -3323,13 +3327,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
if (mem_cgroup_disabled())
return;
+ /* migration can happen before addition */
+ if (!mm->lru_gen.memcg)
+ return;
+
rcu_read_lock();
memcg = mem_cgroup_from_task(task);
rcu_read_unlock();
if (memcg == mm->lru_gen.memcg)
return;
- VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
lru_gen_del_mm(mm);
@@ -6754,8 +6761,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options,
- nodemask_t *nodemask)
+ unsigned int reclaim_options)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -6770,7 +6776,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
- .nodemask = nodemask,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9445bee6b014..702bc3fd687a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -113,7 +113,23 @@
* have room for two bit at least.
*/
#define OBJ_ALLOCATED_TAG 1
-#define OBJ_TAG_BITS 1
+
+#ifdef CONFIG_ZPOOL
+/*
+ * The second least-significant bit in the object's header identifies if the
+ * value stored at the header is a deferred handle from the last reclaim
+ * attempt.
+ *
+ * As noted above, this is valid because we have room for two bits.
+ */
+#define OBJ_DEFERRED_HANDLE_TAG 2
+#define OBJ_TAG_BITS 2
+#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
+#else
+#define OBJ_TAG_BITS 1
+#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
+#endif /* CONFIG_ZPOOL */
+
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
@@ -222,6 +238,12 @@ struct link_free {
* Handle of allocated object.
*/
unsigned long handle;
+#ifdef CONFIG_ZPOOL
+ /*
+ * Deferred handle of a reclaimed object.
+ */
+ unsigned long deferred_handle;
+#endif
};
};
@@ -272,8 +294,6 @@ struct zspage {
/* links the zspage to the lru list in the pool */
struct list_head lru;
bool under_reclaim;
- /* list of unfreed handles whose objects have been reclaimed */
- unsigned long *deferred_handles;
#endif
struct zs_pool *pool;
@@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
+static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
+ int tag)
{
unsigned long handle;
struct zspage *zspage = get_zspage(page);
@@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
} else
handle = *(unsigned long *)obj;
- if (!(handle & OBJ_ALLOCATED_TAG))
+ if (!(handle & tag))
return false;
- *phandle = handle & ~OBJ_ALLOCATED_TAG;
+ /* Clear all tags before returning the handle */
+ *phandle = handle & ~OBJ_TAG_MASK;
return true;
}
+static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
+{
+ return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
+}
+
+#ifdef CONFIG_ZPOOL
+static bool obj_stores_deferred_handle(struct page *page, void *obj,
+ unsigned long *phandle)
+{
+ return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
+}
+#endif
+
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
@@ -946,22 +981,36 @@ unlock:
}
#ifdef CONFIG_ZPOOL
+static unsigned long find_deferred_handle_obj(struct size_class *class,
+ struct page *page, int *obj_idx);
+
/*
* Free all the deferred handles whose objects are freed in zs_free.
*/
-static void free_handles(struct zs_pool *pool, struct zspage *zspage)
+static void free_handles(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
{
- unsigned long handle = (unsigned long)zspage->deferred_handles;
+ int obj_idx = 0;
+ struct page *page = get_first_page(zspage);
+ unsigned long handle;
- while (handle) {
- unsigned long nxt_handle = handle_to_obj(handle);
+ while (1) {
+ handle = find_deferred_handle_obj(class, page, &obj_idx);
+ if (!handle) {
+ page = get_next_page(page);
+ if (!page)
+ break;
+ obj_idx = 0;
+ continue;
+ }
cache_free_handle(pool, handle);
- handle = nxt_handle;
+ obj_idx++;
}
}
#else
-static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
+static inline void free_handles(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage) {}
#endif
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
@@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(fg != ZS_EMPTY);
/* Free all deferred handles from zs_free */
- free_handles(pool, zspage);
+ free_handles(pool, class, zspage);
next = page = get_first_page(zspage);
do {
@@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
#ifdef CONFIG_ZPOOL
INIT_LIST_HEAD(&zspage->lru);
zspage->under_reclaim = false;
- zspage->deferred_handles = NULL;
#endif
set_freeobj(zspage, 0);
@@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(int class_size, unsigned long obj)
+static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
{
struct link_free *link;
struct zspage *zspage;
@@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
-
- /* Insert this object in containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset);
- if (likely(!ZsHugePage(zspage)))
- link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
- else
- f_page->index = 0;
+
+ if (handle) {
+#ifdef CONFIG_ZPOOL
+ /* Stores the (deferred) handle in the object's header */
+ *handle |= OBJ_DEFERRED_HANDLE_TAG;
+ *handle &= ~OBJ_ALLOCATED_TAG;
+
+ if (likely(!ZsHugePage(zspage)))
+ link->deferred_handle = *handle;
+ else
+ f_page->index = *handle;
+#endif
+ } else {
+ /* Insert this object in containing zspage's freelist */
+ if (likely(!ZsHugePage(zspage)))
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ else
+ f_page->index = 0;
+ set_freeobj(zspage, f_objidx);
+ }
+
kunmap_atomic(vaddr);
- set_freeobj(zspage, f_objidx);
mod_zspage_inuse(zspage, -1);
}
@@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
- obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);
#ifdef CONFIG_ZPOOL
@@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
* Reclaim needs the handles during writeback. It'll free
* them along with the zspage when it's done with them.
*
- * Record current deferred handle at the memory location
- * whose address is given by handle.
+ * Record current deferred handle in the object's header.
*/
- record_obj(handle, (unsigned long)zspage->deferred_handles);
- zspage->deferred_handles = (unsigned long *)handle;
+ obj_free(class->size, obj, &handle);
spin_unlock(&pool->lock);
return;
}
#endif
+ obj_free(class->size, obj, NULL);
+
fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_EMPTY)
free_zspage(pool, class, zspage);
@@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
}
/*
- * Find alloced object in zspage from index object and
+ * Find object with a certain tag in zspage from index object and
* return handle.
*/
-static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
+static unsigned long find_tagged_obj(struct size_class *class,
+ struct page *page, int *obj_idx, int tag)
{
unsigned int offset;
int index = *obj_idx;
@@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
offset += class->size * index;
while (offset < PAGE_SIZE) {
- if (obj_allocated(page, addr + offset, &handle))
+ if (obj_tagged(page, addr + offset, &handle, tag))
break;
offset += class->size;
@@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
return handle;
}
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
+{
+ return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
+}
+
+#ifdef CONFIG_ZPOOL
+/*
+ * Find object storing a deferred handle in header in zspage from index object
+ * and return handle.
+ */
+static unsigned long find_deferred_handle_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
+{
+ return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
+}
+#endif
+
struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */
struct page *s_page;
@@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
- obj_free(class->size, used_obj);
+ obj_free(class->size, used_obj, NULL);
}
/* Remember last position in this iteration */
@@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool)
EXPORT_SYMBOL_GPL(zs_destroy_pool);
#ifdef CONFIG_ZPOOL
+static void restore_freelist(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ unsigned int obj_idx = 0;
+ unsigned long handle, off = 0; /* off is within-page offset */
+ struct page *page = get_first_page(zspage);
+ struct link_free *prev_free = NULL;
+ void *prev_page_vaddr = NULL;
+
+ /* in case no free object found */
+ set_freeobj(zspage, (unsigned int)(-1UL));
+
+ while (page) {
+ void *vaddr = kmap_atomic(page);
+ struct page *next_page;
+
+ while (off < PAGE_SIZE) {
+ void *obj_addr = vaddr + off;
+
+ /* skip allocated object */
+ if (obj_allocated(page, obj_addr, &handle)) {
+ obj_idx++;
+ off += class->size;
+ continue;
+ }
+
+ /* free deferred handle from reclaim attempt */
+ if (obj_stores_deferred_handle(page, obj_addr, &handle))
+ cache_free_handle(pool, handle);
+
+ if (prev_free)
+ prev_free->next = obj_idx << OBJ_TAG_BITS;
+ else /* first free object found */
+ set_freeobj(zspage, obj_idx);
+
+ prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
+ /* if last free object in a previous page, need to unmap */
+ if (prev_page_vaddr) {
+ kunmap_atomic(prev_page_vaddr);
+ prev_page_vaddr = NULL;
+ }
+
+ obj_idx++;
+ off += class->size;
+ }
+
+ /*
+ * Handle the last (full or partial) object on this page.
+ */
+ next_page = get_next_page(page);
+ if (next_page) {
+ if (!prev_free || prev_page_vaddr) {
+ /*
+ * There is no free object in this page, so we can safely
+ * unmap it.
+ */
+ kunmap_atomic(vaddr);
+ } else {
+ /* update prev_page_vaddr since prev_free is on this page */
+ prev_page_vaddr = vaddr;
+ }
+ } else { /* this is the last page */
+ if (prev_free) {
+ /*
+ * Reset OBJ_TAG_BITS bit to last link to tell
+ * whether it's allocated object or not.
+ */
+ prev_free->next = -1UL << OBJ_TAG_BITS;
+ }
+
+ /* unmap previous page (if not done yet) */
+ if (prev_page_vaddr) {
+ kunmap_atomic(prev_page_vaddr);
+ prev_page_vaddr = NULL;
+ }
+
+ kunmap_atomic(vaddr);
+ }
+
+ page = next_page;
+ off %= PAGE_SIZE;
+ }
+}
+
static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
{
int i, obj_idx, ret = 0;
@@ -2561,6 +2728,12 @@ next:
return 0;
}
+ /*
+ * Eviction fails on one of the handles, so we need to restore zspage.
+ * We need to rebuild its freelist (and free stored deferred handles),
+ * put it back to the correct size class, and add it to the LRU list.
+ */
+ restore_freelist(pool, class, zspage);
putback_zspage(class, zspage);
list_add(&zspage->lru, &pool->lru);
unlock_zspage(zspage);