From 149562f7509404c382c32c3fa8a6ba356135e5cf Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:05 -0500 Subject: mm/hugetlb: add hugetlb_folio_subpool() helpers Allow hugetlbfs_migrate_folio to check and read subpool information by passing in a folio. Link: https://lkml.kernel.org/r/20220922154207.1575343-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dd54f67e47fd..c5137607e523 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1091,10 +1091,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(&src->page)) { - hugetlb_set_page_subpool(&dst->page, - hugetlb_page_subpool(&src->page)); - hugetlb_set_page_subpool(&src->page, NULL); + if (hugetlb_folio_subpool(src)) { + hugetlb_set_folio_subpool(dst, + hugetlb_folio_subpool(src)); + hugetlb_set_folio_subpool(src, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) -- cgit v1.2.3 From ece62684dcfb714b7d8452056b4a33d426b16457 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:06 -0500 Subject: hugetlbfs: convert hugetlb_delete_from_page_cache() to use folios Remove the last caller of delete_from_page_cache() by converting the code to its folio equivalent. Link: https://lkml.kernel.org/r/20220922154207.1575343-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . 
Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 12 ++++++------ include/linux/pagemap.h | 1 - mm/folio-compat.c | 5 ----- 3 files changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5137607e523..00495fc128c5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,11 +364,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void hugetlb_delete_from_page_cache(struct page *page) +static void hugetlb_delete_from_page_cache(struct folio *folio) { - ClearPageDirty(page); - ClearPageUptodate(page); - delete_from_page_cache(page); + folio_clear_dirty(folio); + folio_clear_uptodate(folio); + filemap_remove_folio(folio); } /* @@ -574,8 +574,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. 
*/ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); + VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); + hugetlb_delete_from_page_cache(folio); ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bbccb4044222..060ee98474ef 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1102,7 +1102,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); -void delete_from_page_cache(struct page *page); void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index e1e23b4947d7..8ae39c06da62 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -124,11 +124,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -void delete_from_page_cache(struct page *page) -{ - return filemap_remove_folio(page_folio(page)); -} - int try_to_release_page(struct page *page, gfp_t gfp) { return filemap_release_folio(page_folio(page), gfp); -- cgit v1.2.3 From 5d89c224328bce791d051bf60aa92d90bae93c01 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:41 +0800 Subject: fs/proc/kcore.c: use hotplug_memory_notifier() directly Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. 
So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-3-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index dff921f7ca33..7692a360972d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -638,10 +637,6 @@ static int __meminit kcore_callback(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block kcore_callback_nb __meminitdata = { - .notifier_call = kcore_callback, - .priority = 0, -}; static struct kcore_list kcore_vmalloc; @@ -694,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - register_hotmemory_notifier(&kcore_callback_nb); + hotplug_memory_notifier(kcore_callback, 0); return 0; } -- cgit v1.2.3 From 1eeaa4fd39b0b1b3e986f8eab6978e69b01e3c5e Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:47 +0800 Subject: memory: move hotplug memory notifier priority to same file for easy sorting The priority of hotplug memory callback is defined in a different file. And there are some callers using numbers directly. Collect them together into include/linux/memory.h for easy reading. This allows us to sort their priorities more intuitively without additional comments. 
Link: https://lkml.kernel.org/r/20220923033347.3935160-9-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Christoph Lameter Cc: David Hildenbrand Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- drivers/acpi/numa/hmat.c | 2 +- fs/proc/kcore.c | 2 +- include/linux/memory-tiers.h | 1 - include/linux/memory.h | 9 +++++++-- kernel/cgroup/cpuset.c | 2 +- mm/kasan/shadow.c | 2 +- mm/ksm.c | 2 +- mm/memory-tiers.c | 2 +- mm/mm_init.c | 2 +- mm/mmap.c | 2 +- mm/page_ext.c | 2 +- 11 files changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 0ecefb604734..139e3b41653e 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -849,7 +849,7 @@ static __init int hmat_init(void) hmat_register_targets(); /* Keep the table and structures if the notifier may use them */ - if (!hotplug_memory_notifier(hmat_callback, 2)) + if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI)) return 0; out_put: hmat_free_structures(); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 7692a360972d..98f3289556e4 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -689,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 965009aa01d7..fc9647b1b4f9 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -18,7 +18,6 @@ * the same memory tier. 
*/ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) -#define MEMTIER_HOTPLUG_PRIO 100 struct memory_tier; struct memory_dev_type { diff --git a/include/linux/memory.h b/include/linux/memory.h index 98d2a2ebcc10..463662ef7614 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -112,8 +112,13 @@ struct mem_section; * Priorities for the hotplug memory callback routines (stored in decreasing * order in the callback chain) */ -#define SLAB_CALLBACK_PRI 1 -#define IPC_CALLBACK_PRI 10 +#define DEFAULT_CALLBACK_PRI 0 +#define SLAB_CALLBACK_PRI 1 +#define HMAT_CALLBACK_PRI 2 +#define MM_COMPUTE_BATCH_PRI 10 +#define CPUSET_CALLBACK_PRI 10 +#define MEMTIER_HOTPLUG_PRI 100 +#define KSM_CALLBACK_PRI 100 #ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0c6db6a4f427..3ea2e836e93e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3647,7 +3647,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 0e3648b603a6..2fba1f51f042 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -244,7 +244,7 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - hotplug_memory_notifier(kasan_mem_notifier, 0); + hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/mm/ksm.c b/mm/ksm.c index c19fcca9bc03..7ba97f86d831 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3211,7 +3211,7 @@ static int __init ksm_init(void) #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no 
significance to this priority 100 */ - hotplug_memory_notifier(ksm_memory_callback, 100); + hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index fa8c9d07f9ce..939e200c283b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -664,7 +664,7 @@ static int __init memory_tier_init(void) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); - hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO); + hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); return 0; } subsys_initcall(memory_tier_init); diff --git a/mm/mm_init.c b/mm/mm_init.c index 44aadc162d1f..c1883362e71d 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -181,7 +181,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, static int __init mm_compute_batch_init(void) { mm_compute_batch(sysctl_overcommit_memory); - hotplug_memory_notifier(mm_compute_batch_notifier, IPC_CALLBACK_PRI); + hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 3f47fd57d165..c697771d406b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3751,7 +3751,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, static int __meminit init_reserve_notifier(void) { - if (hotplug_memory_notifier(reserve_mem_notifier, 0)) + if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; diff --git a/mm/page_ext.c b/mm/page_ext.c index affe80243b6d..b2ff5c9129f4 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -513,7 +513,7 @@ void __init page_ext_init(void) cond_resched(); } } - hotplug_memory_notifier(page_ext_callback, 0); + hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; -- cgit v1.2.3 From 
e025ab842ec35225b1a8e163d1f311beb9e38ce9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 18 Oct 2022 15:40:14 +0800 Subject: mm: remove kern_addr_valid() completely Most architectures (except arm64/x86/sparc) simply return 1 for kern_addr_valid(), which is only used in read_kcore(), and it calls copy_from_kernel_nofault() which could check whether the address is a valid kernel address. So as there is no need for kern_addr_valid(), let's remove it. Link: https://lkml.kernel.org/r/20221018074014.185687-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Geert Uytterhoeven [m68k] Acked-by: Heiko Carstens [s390] Acked-by: Christoph Hellwig Acked-by: Helge Deller [parisc] Acked-by: Michael Ellerman [powerpc] Acked-by: Guo Ren [csky] Acked-by: Catalin Marinas [arm64] Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Anton Ivanov Cc: Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Greg Ungerer Cc: H. 
Peter Anvin Cc: Huacai Chen Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/pgtable.h | 2 -- arch/arc/include/asm/pgtable-bits-arcv2.h | 2 -- arch/arm/include/asm/pgtable-nommu.h | 2 -- arch/arm/include/asm/pgtable.h | 4 --- arch/arm64/include/asm/pgtable.h | 2 -- arch/arm64/mm/mmu.c | 47 ------------------------------- arch/arm64/mm/pageattr.c | 3 +- arch/csky/include/asm/pgtable.h | 3 -- arch/hexagon/include/asm/page.h | 7 ----- arch/ia64/include/asm/pgtable.h | 16 ----------- arch/loongarch/include/asm/pgtable.h | 2 -- arch/m68k/include/asm/pgtable_mm.h | 2 -- arch/m68k/include/asm/pgtable_no.h | 1 - arch/microblaze/include/asm/pgtable.h | 3 -- arch/mips/include/asm/pgtable.h | 2 -- arch/nios2/include/asm/pgtable.h | 2 -- arch/openrisc/include/asm/pgtable.h | 2 -- arch/parisc/include/asm/pgtable.h | 15 ---------- arch/powerpc/include/asm/pgtable.h | 7 ----- arch/riscv/include/asm/pgtable.h | 2 -- arch/s390/include/asm/pgtable.h | 2 -- arch/sh/include/asm/pgtable.h | 2 -- arch/sparc/include/asm/pgtable_32.h | 6 ---- arch/sparc/mm/init_32.c | 3 +- arch/sparc/mm/init_64.c | 1 - arch/um/include/asm/pgtable.h | 2 -- arch/x86/include/asm/pgtable_32.h | 9 ------ arch/x86/include/asm/pgtable_64.h | 1 - arch/x86/mm/init_64.c | 41 --------------------------- arch/xtensa/include/asm/pgtable.h | 2 -- fs/proc/kcore.c | 26 ++++++----------- 31 files changed, 11 insertions(+), 210 deletions(-) (limited to 'fs') diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h 
index 3ea9661c09ff..9e45f6735d5d 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -313,8 +313,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - #define pte_ERROR(e) \ printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index b23be557403e..515e82db519f 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -120,8 +120,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - #ifdef CONFIG_TRANSPARENT_HUGEPAGE #include #endif diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index d16aba48fa0a..25d8c7bb07e0 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -21,8 +21,6 @@ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) -#define kern_addr_valid(addr) (1) -/* FIXME */ /* * PMD_SHIFT determines the size of the area a second-level page table can map * PGDIR_SHIFT determines what a third-level page table entry can map diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 78a532068fec..00954ab1a039 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -298,10 +298,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -/* FIXME: this is not correct */ -#define 
kern_addr_valid(addr) (1) - /* * We provide our own arch_get_unmapped_area to cope with VIPT caches. */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 71a1af42f0e8..4873c1d6e7d0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1021,8 +1021,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) -extern int kern_addr_valid(unsigned long addr); - #ifdef CONFIG_ARM64_MTE #define __HAVE_ARCH_PREPARE_TO_SWAP diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9a7c38965154..556154d821bf 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -814,53 +814,6 @@ void __init paging_init(void) create_idmap(); } -/* - * Check whether a kernel address is valid (derived from arch/x86/). - */ -int kern_addr_valid(unsigned long addr) -{ - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp, pud; - pmd_t *pmdp, pmd; - pte_t *ptep, pte; - - addr = arch_kasan_reset_tag(addr); - if ((((long)addr) >> VA_BITS) != -1UL) - return 0; - - pgdp = pgd_offset_k(addr); - if (pgd_none(READ_ONCE(*pgdp))) - return 0; - - p4dp = p4d_offset(pgdp, addr); - if (p4d_none(READ_ONCE(*p4dp))) - return 0; - - pudp = pud_offset(p4dp, addr); - pud = READ_ONCE(*pudp); - if (pud_none(pud)) - return 0; - - if (pud_sect(pud)) - return pfn_valid(pud_pfn(pud)); - - pmdp = pmd_offset(pudp, addr); - pmd = READ_ONCE(*pmdp); - if (pmd_none(pmd)) - return 0; - - if (pmd_sect(pmd)) - return pfn_valid(pmd_pfn(pmd)); - - ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); - if (pte_none(pte)) - return 0; - - return pfn_valid(pte_pfn(pte)); -} - #ifdef CONFIG_MEMORY_HOTPLUG static void free_hotplug_page_range(struct page *page, size_t size, struct vmem_altmap *altmap) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index d107c3d434e2..0a741a910a6a 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c 
@@ -201,8 +201,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) /* * This function is used to determine if a linear map page has been marked as - * not-valid. Walk the page table and check the PTE_VALID bit. This is based - * on kern_addr_valid(), which almost does what we need. + * not-valid. Walk the page table and check the PTE_VALID bit. * * Because this is only called on the kernel linear map, p?d_sect() implies * p?d_present(). When debug_pagealloc is enabled, sections mappings are diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index c3d9b92cbe61..77bc6caff2d2 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -249,9 +249,6 @@ extern void paging_init(void); void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte); -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) (1) - #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index 7cbf719c578e..d7d4f9fca327 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -131,13 +131,6 @@ static inline void clear_page(void *page) #define page_to_virt(page) __va(page_to_phys(page)) -/* - * For port to Hexagon Virtual Machine, MAYBE we check for attempts - * to reference reserved HVM space, but in any case, the VM will be - * protected. - */ -#define kern_addr_valid(addr) (1) - #include #include /* XXX Todo: implement assembly-optimized version of getorder. 
*/ diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 6925e28ae61d..01517a5e6778 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -181,22 +181,6 @@ ia64_phys_addr_valid (unsigned long addr) return (addr & (local_cpu_data->unimpl_pa_mask)) == 0; } -/* - * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel - * memory. For the return value to be meaningful, ADDR must be >= - * PAGE_OFFSET. This operation can be relatively expensive (e.g., - * require a hash-, or multi-level tree-lookup or something of that - * sort) but it guarantees to return TRUE only if accessing the page - * at that address does not cause an error. Note that there may be - * addresses for which kern_addr_valid() returns FALSE even though an - * access would not cause an error (e.g., this is typically true for - * memory mapped I/O regions. - * - * XXX Need to implement this for IA-64. - */ -#define kern_addr_valid(addr) (1) - - /* * Now come the defines and routines to manage and access the three-level * page table. 
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 946704bee599..fc70b7041b76 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -421,8 +421,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, __update_tlb(vma, address, (pte_t *)pmdp); } -#define kern_addr_valid(addr) (1) - static inline unsigned long pmd_pfn(pmd_t pmd) { return (pmd_val(pmd) & _PFN_MASK) >> _PFN_SHIFT; diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index 9b4e2fe2ac82..b93c41fe2067 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -145,8 +145,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #endif /* !__ASSEMBLY__ */ -#define kern_addr_valid(addr) (1) - /* MMU-specific headers */ #ifdef CONFIG_SUN3 diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index bce5ca56c388..fed58da3a6b6 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -20,7 +20,6 @@ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) -#define kern_addr_valid(addr) (1) #define pmd_offset(a, b) ((void *)0) #define PAGE_NONE __pgprot(0) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index ba348e997dbb..42f5988e998b 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -416,9 +416,6 @@ extern unsigned long iopa(unsigned long addr); #define IOMAP_NOCACHE_NONSER 2 #define IOMAP_NO_COPYBACK 3 -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) (1) - void do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code); diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 6caec386ad2f..364a06033105 100644 --- a/arch/mips/include/asm/pgtable.h +++ 
b/arch/mips/include/asm/pgtable.h @@ -550,8 +550,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, __update_tlb(vma, address, pte); } -#define kern_addr_valid(addr) (1) - /* * Allow physical addresses to be fixed up to help 36-bit peripherals. */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index b3d45e815295..ab793bc517f5 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -249,8 +249,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define kern_addr_valid(addr) (1) - extern void __init paging_init(void); extern void __init mmu_init(void); diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index dcae8aea132f..6477c17b3062 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -395,8 +395,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index ecd028854469..bd09a44cfb2d 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -23,21 +23,6 @@ #include #include -/* - * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel - * memory. For the return value to be meaningful, ADDR must be >= - * PAGE_OFFSET. This operation can be relatively expensive (e.g., - * require a hash-, or multi-level tree-lookup or something of that - * sort) but it guarantees to return TRUE only if accessing the page - * at that address does not cause an error. 
Note that there may be - * addresses for which kern_addr_valid() returns FALSE even though an - * access would not cause an error (e.g., this is typically true for - * memory mapped I/O regions. - * - * XXX Need to implement this for parisc. - */ -#define kern_addr_valid(addr) (1) - /* This is for the serialization of PxTLB broadcasts. At least on the N class * systems, only one PxTLB inter processor broadcast can be active at any one * time on the Merced bus. */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 283f40d05a4d..9972626ddaf6 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -81,13 +81,6 @@ void poking_init(void); extern unsigned long ioremap_bot; extern const pgprot_t protection_map[16]; -/* - * kern_addr_valid is intended to indicate whether an address is a valid - * kernel address. Most 32-bit archs define it as always true (like this) - * but most 64-bit archs actually perform a test. What should we do here? 
- */ -#define kern_addr_valid(addr) (1) - #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 #endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7ec936910a96..c7993bdf749f 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -801,8 +801,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #endif /* !CONFIG_MMU */ -#define kern_addr_valid(addr) (1) /* FIXME */ - extern char _start[]; extern void *_dtb_early_va; extern uintptr_t _dtb_early_pa; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index f1cb9391190d..e1db07211818 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1773,8 +1773,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - extern int vmem_add_mapping(unsigned long start, unsigned long size); extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 6fb9ec54cf9b..3ce30becf6df 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -92,8 +92,6 @@ static inline unsigned long phys_addr_mask(void) typedef pte_t *pte_addr_t; -#define kern_addr_valid(addr) (1) - #define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) struct vm_area_struct; diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 8ff549004fac..5acc05b572e6 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -368,12 +368,6 @@ __get_iospace (unsigned long addr) } } -extern unsigned long *sparc_valid_addr_bitmap; - -/* Needs to be defined here 
and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) \ - (test_bit(__pa((unsigned long)(addr))>>20, sparc_valid_addr_bitmap)) - /* * For sparc32&64, the pfn in io_remap_pfn_range() carries in * its high 4 bits. These macros/functions put it there or get it from there. diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index d88e774c8eb4..9c0ea457bdf0 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -37,8 +37,7 @@ #include "mm_32.h" -unsigned long *sparc_valid_addr_bitmap; -EXPORT_SYMBOL(sparc_valid_addr_bitmap); +static unsigned long *sparc_valid_addr_bitmap; unsigned long phys_base; EXPORT_SYMBOL(phys_base); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index d6faee23c77d..04f9db0c3111 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1667,7 +1667,6 @@ bool kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -EXPORT_SYMBOL(kern_addr_valid); static unsigned long __ref kernel_map_hugepud(unsigned long vstart, unsigned long vend, diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 66bc3f99d9be..4e3052f2671a 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -298,8 +298,6 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - /* Clear a kernel PTE and flush it from the TLB */ #define kpte_clear_flush(ptep, vaddr) \ do { \ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 7c9c968a42ef..7d4ad8907297 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -47,15 +47,6 @@ do { \ #endif /* !__ASSEMBLY__ */ -/* - * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM - */ -#ifdef CONFIG_FLATMEM -#define kern_addr_valid(addr) (1) -#else -#define kern_addr_valid(kaddr) 
(0) -#endif - /* * This is used to calculate the .brk reservation for initial pagetables. * Enough space is reserved to allocate pagetables sufficient to cover all diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e479491da8d5..7929327abe00 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) #define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) -extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3f040c6e5d13..e8db4edd7cc9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1416,47 +1416,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -int kern_addr_valid(unsigned long addr) -{ - unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (above != 0 && above != -1UL) - return 0; - - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) - return 0; - - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) - return 0; - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) - return 0; - - if (pud_large(*pud)) - return pfn_valid(pud_pfn(*pud)); - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - return 0; - - if (pmd_large(*pmd)) - return pfn_valid(pmd_pfn(*pmd)); - - pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) - return 0; - - return pfn_valid(pte_pfn(*pte)); -} - /* * Block size is the minimum amount of memory which can be hotplugged or * hotremoved. 
It must be power of two and must be equal or larger than diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 54f577c13afa..5b5484d707b2 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -386,8 +386,6 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) #else -#define kern_addr_valid(addr) (1) - extern void update_mmu_cache(struct vm_area_struct * vma, unsigned long address, pte_t *ptep); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 98f3289556e4..71157ee35c1a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -540,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) fallthrough; case KCORE_VMEMMAP: case KCORE_TEXT: - if (kern_addr_valid(start)) { - /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. - */ - if (copy_from_kernel_nofault(buf, (void *)start, - tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else { - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; - } + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; } } else { - if (clear_user(buffer, tsz)) { + if (copy_to_user(buffer, buf, tsz)) { ret = -EFAULT; goto out; } -- cgit v1.2.3 From 26215b7ee923b9251f7bb12c4e5f09dc465d35f2 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Fri, 21 Oct 2022 07:16:08 +0800 Subject: hugetlbfs: fix null-ptr-deref in hugetlbfs_parse_param() Syzkaller reports a null-ptr-deref bug as follows: ====================================================== KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:hugetlbfs_parse_param+0x1dd/0x8e0 fs/hugetlbfs/inode.c:1380 [...] 
Call Trace: vfs_parse_fs_param fs/fs_context.c:148 [inline] vfs_parse_fs_param+0x1f9/0x3c0 fs/fs_context.c:129 vfs_parse_fs_string+0xdb/0x170 fs/fs_context.c:191 generic_parse_monolithic+0x16f/0x1f0 fs/fs_context.c:231 do_new_mount fs/namespace.c:3036 [inline] path_mount+0x12de/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] ====================================================== According to commit "vfs: parse: deal with zero length string value", the kernel will set param->string to a null pointer in vfs_parse_fs_string() if the fs string has zero length. Yet the problem is that hugetlbfs_parse_param() will dereference param->string without checking whether it is a null pointer. To be more specific, if hugetlbfs_parse_param() parses an illegal mount parameter, such as "size=,", the kernel will construct a struct fs_parameter with a null pointer in vfs_parse_fs_string(), then pass this struct fs_parameter to hugetlbfs_parse_param(), which triggers the above null-ptr-deref bug. This patch solves it by adding a sanity check on param->string in hugetlbfs_parse_param(). 
Link: https://lkml.kernel.org/r/20221020231609.4810-1-yin31149@gmail.com Reported-by: syzbot+a3e6acd85ded5c16a709@syzkaller.appspotmail.com Tested-by: syzbot+a3e6acd85ded5c16a709@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005ad00405eb7148c6@google.com/ Signed-off-by: Hawkins Jiawei Reviewed-by: Mike Kravetz Cc: Hawkins Jiawei Cc: Muchun Song Cc: Ian Kent Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 00495fc128c5..09e644f80a4a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1378,7 +1378,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; @@ -1388,7 +1388,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; @@ -1404,7 +1404,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; -- cgit v1.2.3 From d09e8ca6cb93bb4b97517a18fbbf7eccb0e9ff43 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 15 Nov 2022 02:06:01 +0000 Subject: mm: anonymous shared memory naming Since commit 9a10064f5625 ("mm: add a field to store names for private anonymous memory"), name for 
private anonymous memory, but not shared anonymous, can be set. However, naming shared anonymous memory is just as useful for tracking purposes. Extend the functionality to be able to set names for shared anon. There are two ways to create anonymous shared memory, using memfd or directly via mmap(): 1. fd = memfd_create(...) mem = mmap(..., MAP_SHARED, fd, ...) 2. mem = mmap(..., MAP_SHARED | MAP_ANONYMOUS, -1, ...) In both cases the anonymous shared memory is created the same way by mapping an unlinked file on tmpfs. The memfd way allows giving a name to anonymous shared memory, but is not useful when parts of shared memory require distinct names. Example use case: The VMM maps VM memory as anonymous shared memory (not private because VMM is sandboxed and drivers are running in their own processes). However, the VM tells back to the VMM how parts of the memory are actually used by the guest, how each of the segments should be backed (i.e. 4K pages, 2M pages), and some other information about the segments. The naming allows us to monitor the effective memory footprint for each of these segments from the host without looking inside the guest. Sample output: /* Create shared anonymous segment */ anon_shmem = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); /* Name the segment: "MY-NAME" */ rv = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, anon_shmem, SIZE, "MY-NAME"); cat /proc/<pid>/maps (and smaps): 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 [anon_shmem:MY-NAME] If the segment is not named, the output is: 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 /dev/zero (deleted) Link: https://lkml.kernel.org/r/20221115020602.804224-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Arnd Bergmann Cc: Bagas Sanjaya Cc: Colin Cross Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: "Kirill A . 
Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Peter Xu Cc: Sean Christopherson Cc: Vincent Whitchurch Cc: Vlastimil Babka Cc: xu xin Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 8 +++++--- fs/proc/task_mmu.c | 15 +++++++++++---- include/linux/mm.h | 2 ++ include/linux/mm_types.h | 26 ++++++++++++-------------- mm/madvise.c | 7 ++----- mm/shmem.c | 29 +++++++++++++++++++++++++---- 6 files changed, 57 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 898c99eae8e4..b8f175ae4853 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,14 +426,16 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. If the mapping is not associated with a file: - ============= ==================================== + =================== =========================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - [anon:] an anonymous mapping that has been + [anon:] a private anonymous mapping that has been named by userspace - ============= ==================================== + [anon_shmem:] an anonymous shared memory mapping that has + been named by userspace + =================== =========================================== or if empty, the mapping is anonymous. 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a74cdcc9af0..89338950afd3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -274,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { + struct anon_vma_name *anon_name = NULL; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; vm_flags_t flags = vma->vm_flags; @@ -293,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); + if (mm) + anon_name = anon_vma_name(vma); /* * Print the dentry name for named mappings, and a @@ -300,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) */ if (file) { seq_pad(m, ' '); - seq_file_path(m, file, "\n"); + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. + */ + if (anon_name) + seq_printf(m, "[anon_shmem:%s]", anon_name->name); + else + seq_file_path(m, file, "\n"); goto done; } @@ -312,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - struct anon_vma_name *anon_name; - if (!mm) { name = "[vdso]"; goto done; @@ -330,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); seq_printf(m, "[anon:%s]", anon_name->name); diff --git a/include/linux/mm.h b/include/linux/mm.h index f873441303b7..686879dbb0bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -700,8 +700,10 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) * paths in userfault. 
*/ bool vma_is_shmem(struct vm_area_struct *vma); +bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b0009e7d4ae..157c2e22cc7f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -549,21 +549,11 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * - * For private anonymous mappings, a pointer to a null terminated string - * containing the name given to the vma, or NULL if unnamed. */ - - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* - * Serialized by mmap_sem. Never use directly because it is - * valid only when vm_file is NULL. Use anon_vma_name instead. - */ - struct anon_vma_name *anon_name; - }; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -584,6 +574,14 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_sem. Use anon_vma_name to access. 
+ */ + struct anon_vma_name *anon_name; +#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif diff --git a/mm/madvise.c b/mm/madvise.c index b913ba6efc10..83b0c91a126b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -95,9 +95,6 @@ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - if (vma->vm_file) - return NULL; - return vma->anon_name; } @@ -183,7 +180,7 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; - if (!vma->vm_file) { + if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; @@ -1273,7 +1270,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, int error; /* Only anonymous mappings can be named */ - if (vma->vm_file) + if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, diff --git a/mm/shmem.c b/mm/shmem.c index 7428ae3fa4b9..f418d21205be 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -237,11 +237,17 @@ static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_anon_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_anon_vm_ops; +} + bool vma_is_shmem(struct vm_area_struct *vma) { - return vma->vm_ops == &shmem_vm_ops; + return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); @@ -2263,7 +2269,8 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { - struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); int 
ret; ret = seal_check_future_write(info->seals, vma); @@ -2274,7 +2281,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_MTE_ALLOWED; file_accessed(file); - vma->vm_ops = &shmem_vm_ops; + /* This is anonymous shared memory if it is unlinked at the time of mmap */ + if (inode->i_nlink) + vma->vm_ops = &shmem_vm_ops; + else + vma->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -3988,6 +3999,15 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; +static const struct vm_operations_struct shmem_anon_vm_ops = { + .fault = shmem_fault, + .map_pages = filemap_map_pages, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; + int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; @@ -4163,6 +4183,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops +#define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 @@ -4268,7 +4289,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; + vma->vm_ops = &shmem_anon_vm_ops; return 0; } -- cgit v1.2.3 From dbaf7dc97ab8d526a20d3477419bc14b4890a82c Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 7 Nov 2022 09:56:59 +0800 Subject: hugetlbfs: inode: remove unnecessary (void*) conversions The ei pointer does not need to cast the type. 
Link: https://lkml.kernel.org/r/20221107015659.3221-1-zeming@nfschina.com Signed-off-by: Li zeming Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3ee84604e36d..790d2727141a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1279,7 +1279,7 @@ static const struct address_space_operations hugetlbfs_aops = { static void init_once(void *foo) { - struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } -- cgit v1.2.3 From 6dd8fe86fa84729538d8bed3149faf9c5886bb5b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:52 -0800 Subject: ext4: convert move_extent_per_page() to use folios Patch series "Removing the try_to_release_page() wrapper", v3. This patchset replaces the remaining calls of try_to_release_page() with the folio equivalent: filemap_release_folio(). This allows us to remove the wrapper. This patch (of 4): Convert move_extent_per_page() to use folios. This change removes 5 calls to compound_head() and is in preparation for the removal of the try_to_release_page() wrapper. 
Link: https://lkml.kernel.org/r/20221118073055.55694-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221118073055.55694-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/move_extent.c | 52 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 044e34cd835c..8dbb87edf24c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; + struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; @@ -313,6 +314,13 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. 
*/ + folio[0] = page_folio(pagep[0]); + folio[1] = page_folio(pagep[1]); + + VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); + VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); + VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -331,10 +339,10 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((page_has_private(pagep[0]) && - !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && - !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; goto drop_data_sem; } @@ -344,19 +352,21 @@ again: block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto unlock_pages; + goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); if (*err) - goto unlock_pages; + goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. 
*/ - if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; - goto unlock_pages; + goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, @@ -369,13 +379,13 @@ data_copy: replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto unlock_pages; + goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!page_has_buffers(pagep[0])) - create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); - bh = page_buffers(pagep[0]); + if (!folio_buffers(folio[0])) + create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); + bh = folio_buffers(folio[0]); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { @@ -385,7 +395,7 @@ data_copy: bh = bh->b_this_page; } if (!*err) - *err = block_commit_write(pagep[0], from, from + replaced_size); + *err = block_commit_write(&folio[0]->page, from, from + replaced_size); if (unlikely(*err < 0)) goto repair_branches; @@ -395,11 +405,11 @@ data_copy: *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); -unlock_pages: - unlock_page(pagep[0]); - put_page(pagep[0]); - unlock_page(pagep[1]); - put_page(pagep[1]); +unlock_folios: + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && @@ -430,7 +440,7 @@ repair_branches: *err = -EIO; } replaced_count = 0; - goto unlock_pages; + goto unlock_folios; } /** -- cgit v1.2.3 From 3720dd6dcac38d03424d6ba38107f39af5318bcf Mon Sep 
17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:22 -0700 Subject: filemap: convert replace_page_cache_page() to replace_page_cache_folio() Patch series "Removing the lru_cache_add() wrapper". This patchset replaces all calls of lru_cache_add() with the folio equivalent: folio_add_lru(). This allows us to get rid of the wrapper. The series passes xfstests and the userfaultfd selftests. This patch (of 5): Eliminates 7 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221101175326.13265-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 2 +- include/linux/pagemap.h | 2 +- mm/filemap.c | 52 ++++++++++++++++++++++++------------------------- 3 files changed, 27 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..26817a2db463 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -837,7 +837,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - replace_page_cache_page(oldpage, newpage); + replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); get_page(newpage); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2ec0ca1f3d38..29e1f9e76eb6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1102,7 +1102,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); void __filemap_remove_folio(struct folio *folio, void *shadow); -void replace_page_cache_page(struct page *old, struct page *new); +void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch 
*fbatch); bool filemap_release_folio(struct folio *folio, gfp_t gfp); diff --git a/mm/filemap.c b/mm/filemap.c index 242cd8bd8330..c4d4ace9cc70 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -788,56 +788,54 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) EXPORT_SYMBOL(file_write_and_wait_range); /** - * replace_page_cache_page - replace a pagecache page with a new one - * @old: page to be replaced - * @new: page to replace with - * - * This function replaces a page in the pagecache with a new one. On - * success it acquires the pagecache reference for the new page and - * drops it for the old page. Both the old and new pages must be - * locked. This function does not add the new page to the LRU, the + * replace_page_cache_folio - replace a pagecache folio with a new one + * @old: folio to be replaced + * @new: folio to replace with + * + * This function replaces a folio in the pagecache with a new one. On + * success it acquires the pagecache reference for the new folio and + * drops it for the old folio. Both the old and new folios must be + * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. 
*/ -void replace_page_cache_page(struct page *old, struct page *new) +void replace_page_cache_folio(struct folio *old, struct folio *new) { - struct folio *fold = page_folio(old); - struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); - VM_BUG_ON_PAGE(!PageLocked(old), old); - VM_BUG_ON_PAGE(!PageLocked(new), new); - VM_BUG_ON_PAGE(new->mapping, new); + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(new->mapping, new); - get_page(new); + folio_get(new); new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(fold, fnew); + mem_cgroup_migrate(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ - if (!PageHuge(old)) - __dec_lruvec_page_state(old, NR_FILE_PAGES); - if (!PageHuge(new)) - __inc_lruvec_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(old)) - __dec_lruvec_page_state(old, NR_SHMEM); - if (PageSwapBacked(new)) - __inc_lruvec_page_state(new, NR_SHMEM); + if (!folio_test_hugetlb(old)) + __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + if (!folio_test_hugetlb(new)) + __lruvec_stat_add_folio(new, NR_FILE_PAGES); + if (folio_test_swapbacked(old)) + __lruvec_stat_sub_folio(old, NR_SHMEM); + if (folio_test_swapbacked(new)) + __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) - free_folio(fold); - folio_put(fold); + free_folio(old); + folio_put(old); } -EXPORT_SYMBOL_GPL(replace_page_cache_page); +EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) -- cgit v1.2.3 From 063aaad792eef49a11d7575dc9914b43c0fa3792 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:23 
-0700 Subject: fuse: convert fuse_try_move_page() to use folios Converts the function to try to move folios instead of pages. Also converts fuse_check_page() to fuse_get_folio() since this is its only caller. This change removes 15 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Miklos Szeredi Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 55 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 26817a2db463..204c332cd343 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -764,11 +764,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) return ncpy; } -static int fuse_check_page(struct page *page) +static int fuse_check_folio(struct folio *folio) { - if (page_mapcount(page) || - page->mapping != NULL || - (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + if (folio_mapped(folio) || + folio->mapping != NULL || + (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_uptodate | @@ -778,7 +778,7 @@ static int fuse_check_page(struct page *page) 1 << PG_reclaim | 1 << PG_waiters | LRU_GEN_MASK | LRU_REFS_MASK))) { - dump_page(page, "fuse: trying to steal weird page"); + dump_page(&folio->page, "fuse: trying to steal weird page"); return 1; } return 0; @@ -787,11 +787,11 @@ static int fuse_check_page(struct page *page) static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; - struct page *oldpage = *pagep; - struct page *newpage; + struct folio *oldfolio = page_folio(*pagep); + struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; - get_page(oldpage); + folio_get(oldfolio); err = unlock_request(cs->req); if (err) goto out_put_old; @@ -814,35 +814,36 @@ static int fuse_try_move_page(struct fuse_copy_state 
*cs, struct page **pagep) if (!pipe_buf_try_steal(cs->pipe, buf)) goto out_fallback; - newpage = buf->page; + newfolio = page_folio(buf->page); - if (!PageUptodate(newpage)) - SetPageUptodate(newpage); + if (!folio_test_uptodate(newfolio)) + folio_mark_uptodate(newfolio); - ClearPageMappedToDisk(newpage); + folio_clear_mappedtodisk(newfolio); - if (fuse_check_page(newpage) != 0) + if (fuse_check_folio(newfolio) != 0) goto out_fallback_unlock; /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it */ - if (WARN_ON(page_mapped(oldpage))) + if (WARN_ON(folio_mapped(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(page_has_private(oldpage))) + if (WARN_ON(folio_has_private(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + if (WARN_ON(folio_test_dirty(oldfolio) || + folio_test_writeback(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageMlocked(oldpage))) + if (WARN_ON(folio_test_mlocked(oldfolio))) goto out_fallback_unlock; - replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); + replace_page_cache_folio(oldfolio, newfolio); - get_page(newpage); + folio_get(newfolio); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(newpage); + folio_add_lru(newfolio); /* * Release while we have extra ref on stolen page. 
Otherwise @@ -855,28 +856,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = newpage; + *pagep = &newfolio->page; spin_unlock(&cs->req->waitq.lock); if (err) { - unlock_page(newpage); - put_page(newpage); + folio_unlock(newfolio); + folio_put(newfolio); goto out_put_old; } - unlock_page(oldpage); + folio_unlock(oldfolio); /* Drop ref for ap->pages[] array */ - put_page(oldpage); + folio_put(oldfolio); cs->len = 0; err = 0; out_put_old: /* Drop ref obtained in this function */ - put_page(oldpage); + folio_put(oldfolio); return err; out_fallback_unlock: - unlock_page(newpage); + folio_unlock(newfolio); out_fallback: cs->pg = buf->page; cs->offset = buf->offset; -- cgit v1.2.3 From 169004265860327182ecf92297b25b6271e81e96 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:51 +0000 Subject: fsdax: introduce page->share for fsdax in reflink mode Patch series "fsdax,xfs: fix warning messages", v2. Many testcases failed in dax+reflink mode with warning message in dmesg. Such as generic/051,075,127. 
The warning message is like this: [ 775.509337] ------------[ cut here ]------------ [ 775.509636] WARNING: CPU: 1 PID: 16815 at fs/dax.c:386 dax_insert_entry.cold+0x2e/0x69 [ 775.510151] Modules linked in: auth_rpcgss oid_registry nfsv4 algif_hash af_alg af_packet nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink ip6table_filter ip6_tables iptable_filter ip_tables x_tables dax_pmem nd_pmem nd_btt sch_fq_codel configfs xfs libcrc32c fuse [ 775.524288] CPU: 1 PID: 16815 Comm: fsx Kdump: loaded Tainted: G W 6.1.0-rc4+ #164 eb34e4ee4200c7cbbb47de2b1892c5a3e027fd6d [ 775.524904] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.0-3-3 04/01/2014 [ 775.525460] RIP: 0010:dax_insert_entry.cold+0x2e/0x69 [ 775.525797] Code: c7 c7 18 eb e0 81 48 89 4c 24 20 48 89 54 24 10 e8 73 6d ff ff 48 83 7d 18 00 48 8b 54 24 10 48 8b 4c 24 20 0f 84 e3 e9 b9 ff <0f> 0b e9 dc e9 b9 ff 48 c7 c6 a0 20 c3 81 48 c7 c7 f0 ea e0 81 48 [ 775.526708] RSP: 0000:ffffc90001d57b30 EFLAGS: 00010082 [ 775.527042] RAX: 000000000000002a RBX: 0000000000000000 RCX: 0000000000000042 [ 775.527396] RDX: ffffea000a0f6c80 RSI: ffffffff81dfab1b RDI: 00000000ffffffff [ 775.527819] RBP: ffffea000a0f6c40 R08: 0000000000000000 R09: ffffffff820625e0 [ 775.528241] R10: ffffc90001d579d8 R11: ffffffff820d2628 R12: ffff88815fc98320 [ 775.528598] R13: ffffc90001d57c18 R14: 0000000000000000 R15: 0000000000000001 [ 775.528997] FS: 00007f39fc75d740(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 775.529474] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 775.529800] CR2: 00007f39fc772040 CR3: 0000000107eb6001 CR4: 00000000003706e0 [ 775.530214] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 775.530592] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 775.531002] Call Trace: [ 775.531230] [ 775.531444] dax_fault_iter+0x267/0x6c0 [ 
775.531719] dax_iomap_pte_fault+0x198/0x3d0 [ 775.532002] __xfs_filemap_fault+0x24a/0x2d0 [xfs aa8d25411432b306d9554da38096f4ebb86bdfe7] [ 775.532603] __do_fault+0x30/0x1e0 [ 775.532903] do_fault+0x314/0x6c0 [ 775.533166] __handle_mm_fault+0x646/0x1250 [ 775.533480] handle_mm_fault+0xc1/0x230 [ 775.533810] do_user_addr_fault+0x1ac/0x610 [ 775.534110] exc_page_fault+0x63/0x140 [ 775.534389] asm_exc_page_fault+0x22/0x30 [ 775.534678] RIP: 0033:0x7f39fc55820a [ 775.534950] Code: 00 01 00 00 00 74 99 83 f9 c0 0f 87 7b fe ff ff c5 fe 6f 4e 20 48 29 fe 48 83 c7 3f 49 8d 0c 10 48 83 e7 c0 48 01 fe 48 29 f9 a4 c4 c1 7e 7f 00 c4 c1 7e 7f 48 20 c5 f8 77 c3 0f 1f 44 00 00 [ 775.535839] RSP: 002b:00007ffc66a08118 EFLAGS: 00010202 [ 775.536157] RAX: 00007f39fc772001 RBX: 0000000000042001 RCX: 00000000000063c1 [ 775.536537] RDX: 0000000000006400 RSI: 00007f39fac42050 RDI: 00007f39fc772040 [ 775.536919] RBP: 0000000000006400 R08: 00007f39fc772001 R09: 0000000000042000 [ 775.537304] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 [ 775.537694] R13: 00007f39fc772000 R14: 0000000000006401 R15: 0000000000000003 [ 775.538086] [ 775.538333] ---[ end trace 0000000000000000 ]--- This also affects dax+noreflink mode if we run the test after a dax+reflink test. So, the most urgent thing is solving the warning messages. With these fixes, most warning messages in dax_associate_entry() are gone. But honestly, generic/388 will still randomly fail with the warning. The test case shuts down the xfs while fsstress is running, and does so many times. I think the reason is that dax pages in use cannot be invalidated in time when the fs is shut down. The next time the dax page is associated, it still retains the mapping value set last time. I'll keep working on it. The warning message in dax_writeback_one() can also be fixed because of the dax unshare. This patch (of 8): An fsdax page is used not only for CoW, but also for mapread.
To make it easily understood, use 'share' to indicate that the dax page is shared by more than one extent. And add helper functions to use it. Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED. [ruansy.fnst@fujitsu.com: rename several functions] Link: https://lkml.kernel.org/r/1669972991-246-1-git-send-email-ruansy.fnst@fujitsu.com [ruansy.fnst@fujitsu.com: v2.2] Link: https://lkml.kernel.org/r/1670381359-53-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-2-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: Alistair Popple Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 38 ++++++++++++++++++++++---------------- include/linux/mm_types.h | 5 ++++- include/linux/page-flags.h | 2 +- 3 files changed, 27 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 1c6867810cbd..84fadea08705 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) -static inline bool dax_mapping_is_cow(struct address_space *mapping) +static inline bool dax_page_is_shared(struct page *page) { - return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; + return page->mapping == PAGE_MAPPING_DAX_SHARED; } /* - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount. + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the + * refcount. */ -static inline void dax_mapping_set_cow(struct page *page) +static inline void dax_page_share_get(struct page *page) { - if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + if (page->mapping != PAGE_MAPPING_DAX_SHARED) { /* * Reset the index if the page was already mapped * regularly before.
*/ if (page->mapping) - page->index = 1; - page->mapping = (void *)PAGE_MAPPING_DAX_COW; + page->share = 1; + page->mapping = PAGE_MAPPING_DAX_SHARED; } - page->index++; + page->share++; +} + +static inline unsigned long dax_page_share_put(struct page *page) +{ + return --page->share; } /* - * When it is called in dax_insert_entry(), the cow flag will indicate that + * When it is called in dax_insert_entry(), the shared flag will indicate that * whether this entry is shared by multiple files. If so, set the page->mapping - * FS_DAX_MAPPING_COW, and use page->index as refcount. + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount. */ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address, bool cow) + struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - if (cow) { - dax_mapping_set_cow(page); + if (shared) { + dax_page_share_get(page); } else { WARN_ON_ONCE(page->mapping); page->mapping = mapping; @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - if (dax_mapping_is_cow(page->mapping)) { - /* keep the CoW flag if this page is still shared */ - if (page->index-- > 0) + if (dax_page_is_shared(page)) { + /* keep the shared flag if this page is still shared */ + if (dax_page_share_put(page) > 0) continue; } else WARN_ON_ONCE(page->mapping && page->mapping != mapping); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 199f98be6f9c..3b8475007734 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -104,7 +104,10 @@ struct page { }; /* See page-flags.h for PAGE_MAPPING_FLAGS 
*/ struct address_space *mapping; - pgoff_t index; /* Our offset within mapping. */ + union { + pgoff_t index; /* Our offset within mapping. */ + unsigned long share; /* share count for fsdax */ + }; /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e42c55a7e012..9aec9fd8c50b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -638,7 +638,7 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) * Different with flags above, this flag is used only for fsdax mode. It * indicates that this page->mapping is now under reflink case. */ -#define PAGE_MAPPING_DAX_COW 0x1 +#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) static __always_inline bool folio_mapping_flags(struct folio *folio) { -- cgit v1.2.3 From f80e1668888f34c0764822e74953c997daf2ccdb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:52 +0000 Subject: fsdax: invalidate pages when CoW CoW changes the share state of a dax page, but the share count of the page isn't updated. The next time this page is accessed, it should be treated as newly accessed, but the old association still exists. So, we need to clear the share state when CoW happens, in both dax_iomap_rw() and dax_zero_iter(). Link: https://lkml.kernel.org/r/1669908538-55-3-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J.
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 84fadea08705..c975d075e77b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1264,6 +1264,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return length; + /* + * invalidate the pages whose sharing state is to be changed + * because of CoW. + */ + if (iomap->flags & IOMAP_F_SHARED) + invalidate_inode_pages2_range(iter->inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + length - 1) >> PAGE_SHIFT); + do { unsigned offset = offset_in_page(pos); unsigned size = min_t(u64, PAGE_SIZE - offset, length); @@ -1324,12 +1333,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; - const struct iomap *srcmap = &iomi->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; + bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; @@ -1356,7 +1366,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. 
*/ - if (iomap->flags & IOMAP_F_NEW) { + if (iomap->flags & IOMAP_F_NEW || cow) { invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1390,8 +1400,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { + if (cow) { ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, kaddr); if (ret) -- cgit v1.2.3 From 708dfad2eb4169324189782edd6d3763237e0489 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:53 +0000 Subject: fsdax: zero the edges if source is HOLE or UNWRITTEN If srcmap contains invalid data, such as HOLE and UNWRITTEN, the dest page should be zeroed. Otherwise, since it is pmem, old data may remain on the dest page and the result of CoW will be incorrect. The function name is also not easy to understand, so rename it to "dax_iomap_copy_around()", which means it copies data around the range. [akpm@linux-foundation.org: update dax_iomap_copy_around() kerneldoc, per Darrick] Link: https://lkml.kernel.org/r/1669973145-318-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-4-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 79 ++++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index c975d075e77b..359b958eb835 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1092,7 +1092,8 @@ out: } /** - * dax_iomap_cow_copy - Copy the data from source to destination before write + * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page + * by copying the data before and after the range to be written. * @pos: address to do copy from.
* @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) @@ -1101,35 +1102,50 @@ out: * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX - * write operation, dax_iomap_actor() might call this to do the copy of either + * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. In the latter case the rest of the copy of - * aligned ranges is taken care by dax_iomap_actor() itself. + * aligned ranges is taken care by dax_iomap_iter() itself. + * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the + * area to make sure no old data remains. */ -static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, +static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); + /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; + /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ + bool zero_edge = srcmap->flags & IOMAP_F_SHARED || + srcmap->type == IOMAP_UNWRITTEN; void *saddr = 0; int ret = 0; - ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); - if (ret) - return ret; + if (!zero_edge) { + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); + if (ret) + return ret; + } if (copy_all) { - ret = copy_mc_to_kernel(daddr, saddr, length); - return ret ? 
-EIO : 0; + if (zero_edge) + memset(daddr, 0, size); + else + ret = copy_mc_to_kernel(daddr, saddr, length); + goto out; } /* Copy the head part of the range */ if (head_off) { - ret = copy_mc_to_kernel(daddr, saddr, head_off); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr, 0, head_off); + else { + ret = copy_mc_to_kernel(daddr, saddr, head_off); + if (ret) + return -EIO; + } } /* Copy the tail part of the range */ @@ -1137,12 +1153,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; - ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, - tail_len); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr + tail_off, 0, tail_len); + else { + ret = copy_mc_to_kernel(daddr + tail_off, + saddr + tail_off, tail_len); + if (ret) + return -EIO; + } } - return 0; +out: + if (zero_edge) + dax_flush(srcmap->dax_dev, daddr, size); + return ret ? -EIO : 0; } /* @@ -1241,13 +1264,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) if (ret < 0) return ret; memset(kaddr + offset, 0, size); - if (srcmap->addr != iomap->addr) { - ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, - kaddr); - if (ret < 0) - return ret; - dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE); - } else + if (iomap->flags & IOMAP_F_SHARED) + ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, + kaddr); + else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } @@ -1401,8 +1421,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } if (cow) { - ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, - kaddr); + ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, + srcmap, kaddr); if (ret) break; } @@ -1547,7 +1567,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; - const struct iomap *srcmap = &iter->srcmap; + const struct iomap 
*srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; @@ -1578,9 +1598,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { - err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr); + if (write && iomap->flags & IOMAP_F_SHARED) { + err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } -- cgit v1.2.3 From c6f0b395b2110aa26a134a9a395875b1ec0a5aae Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:54 +0000 Subject: fsdax,xfs: set the shared flag when file extent is shared If a dax page is shared, mapread at different offsets can also trigger page fault on same dax page. So, change the flag from "cow" to "shared". And get the shared flag from filesystem when read. Link: https://lkml.kernel.org/r/1669908538-55-5-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 19 +++++++------------ fs/xfs/xfs_iomap.c | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 359b958eb835..fa547ce41add 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -846,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, (iter->iomap.flags & IOMAP_F_DIRTY); } -static bool dax_fault_is_cow(const struct iomap_iter *iter) -{ - return (iter->flags & IOMAP_WRITE) && - (iter->iomap.flags & IOMAP_F_SHARED); -} - /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to @@ -865,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); - bool dirty = !dax_fault_is_synchronous(iter, vmf->vma); - bool cow = dax_fault_is_cow(iter); + bool write = iter->flags & IOMAP_WRITE; + bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); + bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { + if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) @@ -883,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, xas_reset(xas); xas_lock_irq(xas); - if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, - cow); + shared); 
/* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -908,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - if (cow) + if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..881de99766ca 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1215,7 +1215,7 @@ xfs_read_iomap_begin( return error; error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); - if (!error && (flags & IOMAP_REPORT)) + if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode))) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); xfs_iunlock(ip, lockmode); -- cgit v1.2.3 From 0e79e3736d54bb8efbc9fb29cc3b54a132783565 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:31:41 +0000 Subject: fsdax: dedupe: iter two files at the same time The iomap_iter() on a range of one file may loop more than once. In this case, the inner dst_iter can update its iomap but the outer src_iter can't. This may cause the wrong remapping in filesystem. Let them called at the same time. Link: https://lkml.kernel.org/r/1669908701-93-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index fa547ce41add..8fb928cd9dce 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1965,15 +1965,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret; + int ret, compared = 0; - while ((ret = iomap_iter(&src_iter, ops)) > 0) { - while ((ret = iomap_iter(&dst_iter, ops)) > 0) { - dst_iter.processed = dax_range_compare_iter(&src_iter, - &dst_iter, len, same); - } - if (ret <= 0) - src_iter.processed = ret; + while ((ret = iomap_iter(&src_iter, ops)) > 0 && + (ret = iomap_iter(&dst_iter, ops)) > 0) { + compared = dax_range_compare_iter(&src_iter, &dst_iter, len, + same); + if (compared < 0) + return ret; + src_iter.processed = dst_iter.processed = compared; } return ret; } -- cgit v1.2.3 From 64e6edc185da7e101e867c4732c097fedb1da08e Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:10 +0000 Subject: xfs: use dax ops for zero and truncate in fsdax mode Zero and truncate on a dax file may execute CoW. So use dax ops which contains end work for CoW. Link: https://lkml.kernel.org/r/1669908730-131-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/xfs/xfs_iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 881de99766ca..d9401d0300ad 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1370,7 +1370,7 @@ xfs_zero_range( if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, &xfs_buffered_write_iomap_ops); } @@ -1385,7 +1385,7 @@ xfs_truncate_page( if (IS_DAX(inode)) return dax_truncate_page(inode, pos, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, &xfs_buffered_write_iomap_ops); } -- cgit v1.2.3 From d984648e428bf88cbd94ebe346c73632cb92fffb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:33 +0000 Subject: fsdax,xfs: port unshare to fsdax Implement unshare in fsdax mode: copy data from srcmap to iomap. Link: https://lkml.kernel.org/r/1669908753-169-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_reflink.c | 8 ++++++-- include/linux/dax.h | 2 ++ 3 files changed, 60 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 8fb928cd9dce..c48a3a93ab29 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1245,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ +static s64 dax_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + int id = 0; + s64 ret = 0; + void *daddr = NULL, *saddr = NULL; + + /* don't bother with blocks that are not shared to start with */ + if (!(iomap->flags & IOMAP_F_SHARED)) + return length; + /* don't bother with holes or unwritten extents */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + id = dax_read_lock(); + ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = copy_mc_to_kernel(daddr, saddr, length); + if (ret) + ret = -EIO; + +out_unlock: + dax_read_unlock(id); + return ret; +} + +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_unshare_iter(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(dax_file_unshare); + static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; diff --git 
a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 93bdd25680bc..fe46bce8cae6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1693,8 +1693,12 @@ xfs_reflink_unshare( inode_dio_wait(inode); - error = iomap_file_unshare(inode, offset, len, - &xfs_buffered_write_iomap_ops); + if (IS_DAX(inode)) + error = dax_file_unshare(inode, offset, len, + &xfs_dax_write_iomap_ops); + else + error = iomap_file_unshare(inode, offset, len, + &xfs_buffered_write_iomap_ops); if (error) goto out; diff --git a/include/linux/dax.h b/include/linux/dax.h index ba985333e26b..2b5ecb591059 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -205,6 +205,8 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping, } #endif +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops); int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, -- cgit v1.2.3 From 480017957d6380d3336a8e80ad90f70415bb86f7 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:53 +0000 Subject: xfs: remove restrictions for fsdax and reflink Since the basic functionality for fsdax and reflink has been implemented, remove the restrictions on them to allow wider testing. Link: https://lkml.kernel.org/r/1669908773-207-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J.
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/xfs/xfs_ioctl.c | 4 ---- fs/xfs/xfs_iops.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f783e979629..13f1b2add390 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags( if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; - /* Don't allow us to set DAX mode for a reflinked file for now. */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) - return -EINVAL; - /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); if (i_flags2 && !xfs_has_v3inodes(mp)) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2e10e1c66ad6..bf0495f7a5e1 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1185,10 +1185,6 @@ xfs_inode_supports_dax( if (!S_ISREG(VFS_I(ip)->i_mode)) return false; - /* Only supported on non-reflinked files. */ - if (xfs_is_reflink_inode(ip)) - return false; - /* Block size must match page size */ if (mp->m_sb.sb_blocksize != PAGE_SIZE) return false; -- cgit v1.2.3 From a11774122180a782b327b0a9a5091d99c91a4db7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:38 +0100 Subject: exfat: remove ->writepage Patch series "start removing writepage instances v2". The VM doesn't need or want ->writepage for writeback and is fine with just having ->writepages as long as ->migrate_folio is implemented. This series removes all ->writepage instances that use block_write_full_page directly and also have a plain mpage_writepages based ->writepages. This patch (of 7): ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present.
Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-1-hch@lst.de Link: https://lkml.kernel.org/r/20221202102644.770505-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Namjae Jeon Acked-by: Johannes Weiner Cc: Bob Copeland Cc: Dave Kleikamp Cc: Jan Kara Cc: Mikulas Patocka Cc: OGAWA Hirofumi Cc: Sungjong Seo Signed-off-by: Andrew Morton --- fs/exfat/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 5590a1e83126..eac95bcd9a8a 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -345,11 +345,6 @@ static void exfat_readahead(struct readahead_control *rac) mpage_readahead(rac, exfat_get_block); } -static int exfat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, exfat_get_block, wbc); -} - static int exfat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -473,12 +468,12 @@ static const struct address_space_operations exfat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = exfat_read_folio, .readahead = exfat_readahead, - .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, .write_end = exfat_write_end, .direct_IO = exfat_direct_IO, - .bmap = exfat_aop_bmap + .bmap = exfat_aop_bmap, + .migrate_folio = buffer_migrate_folio, }; static inline unsigned long exfat_hash(loff_t i_pos) -- cgit v1.2.3 From ee649af0d9a60ea61a5dad99ef5d6b4aa346f0a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:39 +0100 Subject: fat: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221202102644.770505-3-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/fat/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1cbcc4608dc7..d99b8549ec8f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -194,11 +194,6 @@ static int fat_get_block(struct inode *inode, sector_t iblock, return 0; } -static int fat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, fat_get_block, wbc); -} - static int fat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -346,12 +341,12 @@ static const struct address_space_operations fat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = fat_read_folio, .readahead = fat_readahead, - .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, .write_end = fat_write_end, .direct_IO = fat_direct_IO, - .bmap = _fat_bmap + .bmap = _fat_bmap, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit v1.2.3 From ba195d9f14829690b8e4f67549960d83169a314e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:40 +0100 Subject: hfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and stop wiring up ->writepage for hfs_aops.
Link: https://lkml.kernel.org/r/20221202102644.770505-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..16466a5e88b4 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -173,12 +173,12 @@ const struct address_space_operations hfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, - .writepage = hfs_writepage, .write_begin = hfs_write_begin, .write_end = generic_write_end, .bmap = hfs_bmap, .direct_IO = hfs_direct_IO, .writepages = hfs_writepages, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit v1.2.3 From 12f9b9a73dc603e658bf24eed2777cecdaf4103e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:41 +0100 Subject: hfsplus: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and stop wiring up ->writepage for hfsplus_aops.
Link: https://lkml.kernel.org/r/20221202102644.770505-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hfsplus/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..d6572ad2407a 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -170,12 +170,12 @@ const struct address_space_operations hfsplus_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, - .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .bmap = hfsplus_bmap, .direct_IO = hfsplus_direct_IO, .writepages = hfsplus_writepages, + .migrate_folio = buffer_migrate_folio, }; const struct dentry_operations hfsplus_dentry_operations = { -- cgit v1.2.3 From cd2e6024260de27a523e0af6ee47a20a6b8b8aa8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:42 +0100 Subject: hpfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221202102644.770505-6-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hpfs/file.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index f7547a62c81f..88952d4a631e 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -163,11 +163,6 @@ static int hpfs_read_folio(struct file *file, struct folio *folio) return mpage_read_folio(folio, hpfs_get_block); } -static int hpfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, hpfs_get_block, wbc); -} - static void hpfs_readahead(struct readahead_control *rac) { mpage_readahead(rac, hpfs_get_block); @@ -248,12 +243,12 @@ const struct address_space_operations hpfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hpfs_read_folio, - .writepage = hpfs_writepage, .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, - .bmap = _hpfs_bmap + .bmap = _hpfs_bmap, + .migrate_folio = buffer_migrate_folio, }; const struct file_operations hpfs_file_ops = -- cgit v1.2.3 From 2274c3b281bb47e6980ae42fb8dc93b7a38192d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:43 +0100 Subject: jfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221202102644.770505-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Dave Kleikamp Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/jfs/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index d1ec920aa030..8ac10e396050 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -264,11 +264,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, return rc; } -static int jfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, jfs_get_block, wbc); -} - static int jfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -355,12 +350,12 @@ const struct address_space_operations jfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = jfs_read_folio, .readahead = jfs_readahead, - .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, .write_end = jfs_write_end, .bmap = jfs_bmap, .direct_IO = jfs_direct_IO, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit v1.2.3 From 1bda9dad5aa0199c8592bac32b91afbf8ea236ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:44 +0100 Subject: omfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221202102644.770505-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Bob Copeland Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/omfs/file.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/omfs/file.c b/fs/omfs/file.c index fa7fe2393ff6..3a5b4b88a583 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -294,11 +294,6 @@ static void omfs_readahead(struct readahead_control *rac) mpage_readahead(rac, omfs_get_block); } -static int omfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, omfs_get_block, wbc); -} - static int omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -375,10 +370,10 @@ const struct address_space_operations omfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = omfs_read_folio, .readahead = omfs_readahead, - .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, .write_end = generic_write_end, .bmap = omfs_bmap, + .migrate_folio = buffer_migrate_folio, }; -- cgit v1.2.3 From 8614d6c5eda005ad72b37afeaae2879d7c101b18 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 5 Dec 2022 18:30:07 +0100 Subject: mm: do not show fs mm pc for VM_LOCKONFAULT pages When VM_LOCKONFAULT was added, /proc/PID/smaps wasn't hooked up to it, so looking at /proc/PID/smaps, it shows '??' instead of something intelligible. This can be reached by userspace by simply calling `mlock2(..., MLOCK_ONFAULT);`. Fix this by adding "lf" to denote VM_LOCKONFAULT. Link: https://lkml.kernel.org/r/20221205173007.580210-1-Jason@zx2c4.com Fixes: de60f5f10c58 ("mm: introduce VM_LOCKONFAULT") Signed-off-by: Jason A. Donenfeld Acked-by: Vlastimil Babka Cc: Eric B Munson Cc: Kirill A. 
Shutemov Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 89338950afd3..e35a0398db63 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -674,6 +674,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", -- cgit v1.2.3