From 0069455bcbf9ea73ffe4553ed6d2b4e4cad703de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2024 09:36:23 -0700 Subject: fix missing vmalloc.h includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Memory allocation profiling", v6. Overview: Low overhead [1] per-callsite memory allocation profiling. Not just for debug kernels, overhead low enough to be deployed in production. Example output: root@moria-kvm:~# sort -rn /proc/allocinfo 127664128 31168 mm/page_ext.c:270 func:alloc_page_ext 56373248 4737 mm/slub.c:2259 func:alloc_slab_page 14880768 3633 mm/readahead.c:247 func:page_cache_ra_unbounded 14417920 3520 mm/mm_init.c:2530 func:alloc_large_system_hash 13377536 234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs 11718656 2861 mm/filemap.c:1919 func:__filemap_get_folio 9192960 2800 kernel/fork.c:307 func:alloc_thread_stack_node 4206592 4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable 4136960 1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start 3940352 962 mm/memory.c:4214 func:alloc_anon_folio 2894464 22613 fs/kernfs/dir.c:615 func:__kernfs_new_node ... Usage: kconfig options: - CONFIG_MEM_ALLOC_PROFILING - CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT - CONFIG_MEM_ALLOC_PROFILING_DEBUG adds warnings for allocations that weren't accounted because of a missing annotation sysctl: /proc/sys/vm/mem_profiling Runtime info: /proc/allocinfo Notes: [1]: Overhead To measure the overhead we are comparing the following configurations: (1) Baseline with CONFIG_MEMCG_KMEM=n (2) Disabled by default (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n) (3) Enabled by default (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=y) (4) Enabled at runtime (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n && /proc/sys/vm/mem_profiling=1) (5) Baseline with CONFIG_MEMCG_KMEM=y && allocating with __GFP_ACCOUNT (6) Disabled by default (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n) && CONFIG_MEMCG_KMEM=y (7) Enabled by default (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=y) && CONFIG_MEMCG_KMEM=y Performance overhead: To evaluate performance we implemented an in-kernel test executing multiple get_free_page/free_page and kmalloc/kfree calls with allocation sizes growing from 8 to 240 bytes with CPU frequency set to max and CPU affinity set to a specific CPU to minimize the noise. Below are results from running the test on Ubuntu 22.04.2 LTS with 6.8.0-rc1 kernel on 56 core Intel Xeon: kmalloc pgalloc (1 baseline) 6.764s 16.902s (2 default disabled) 6.793s (+0.43%) 17.007s (+0.62%) (3 default enabled) 7.197s (+6.40%) 23.666s (+40.02%) (4 runtime enabled) 7.405s (+9.48%) 23.901s (+41.41%) (5 memcg) 13.388s (+97.94%) 48.460s (+186.71%) (6 def disabled+memcg) 13.332s (+97.10%) 48.105s (+184.61%) (7 def enabled+memcg) 13.446s (+98.78%) 54.963s (+225.18%) Memory overhead: Kernel size: text data bss dec diff (1) 26515311 18890222 17018880 62424413 (2) 26524728 19423818 16740352 62688898 264485 (3) 26524724 19423818 16740352 62688894 264481 (4) 26524728 19423818 16740352 62688898 264485 (5) 26541782 18964374 16957440 62463596 39183 Memory consumption on a 56 core Intel CPU with 125GB of memory: Code tags: 192 kB PageExts: 262144 kB (256MB) SlabExts: 9876 kB (9.6MB) PcpuExts: 512 kB (0.5MB) Total overhead is 0.2% of total memory. Benchmarks: Hackbench tests run 100 times: hackbench -s 512 -l 200 -g 15 -f 25 -P baseline disabled profiling enabled profiling avg 0.3543 0.3559 (+0.0016) 0.3566 (+0.0023) stdev 0.0137 0.0188 0.0077 hackbench -l 10000 baseline disabled profiling enabled profiling avg 6.4218 6.4306 (+0.0088) 6.5077 (+0.0859) stdev 0.0933 0.0286 0.0489 stress-ng tests: stress-ng --class memory --seq 4 -t 60 stress-ng --class cpu --seq 4 -t 60 Results posted at: https://evilpiepirate.org/~kent/memalloc_prof_v4_stress-ng/ [2] https://lore.kernel.org/all/20240306182440.2003814-1-surenb@google.com/ This patch (of 37): The next patch drops vmalloc.h from a system header in order to fix a circular dependency; this adds it to all the files that were pulling it in implicitly. [kent.overstreet@linux.dev: fix arch/alpha/lib/memcpy.c] Link: https://lkml.kernel.org/r/20240327002152.3339937-1-kent.overstreet@linux.dev [surenb@google.com: fix arch/x86/mm/numa_32.c] Link: https://lkml.kernel.org/r/20240402180933.1663992-1-surenb@google.com [kent.overstreet@linux.dev: a few places were depending on sizes.h] Link: https://lkml.kernel.org/r/20240404034744.1664840-1-kent.overstreet@linux.dev [arnd@arndb.de: fix mm/kasan/hw_tags.c] Link: https://lkml.kernel.org/r/20240404124435.3121534-1-arnd@kernel.org [surenb@google.com: fix arc build] Link: https://lkml.kernel.org/r/20240405225115.431056-1-surenb@google.com Link: https://lkml.kernel.org/r/20240321163705.3067592-1-surenb@google.com Link: https://lkml.kernel.org/r/20240321163705.3067592-2-surenb@google.com Signed-off-by: Kent Overstreet Signed-off-by: Suren Baghdasaryan Signed-off-by: Arnd Bergmann Reviewed-by: Pasha Tatashin Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Miguel Ojeda Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- drivers/vfio/pci/pds/dirty.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index 68e8f006dfdb..c51f5e4c3dd6 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -3,6 +3,7 @@ #include #include +#include #include #include -- cgit v1.2.3 From 29ae7d96d166fa08c7232daf8a314ef5ba1efd20 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 10 Apr 2024 17:55:26 +0200 Subject: mm: pass VMA instead of MM to follow_pte() ... and centralize the VM_IO/VM_PFNMAP sanity check in there. We'll now also perform these sanity checks for direct follow_pte() invocations. For generic_access_phys(), we might now check multiple times: nothing to worry about, really. Link: https://lkml.kernel.org/r/20240410155527.474777-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Sean Christopherson [KVM] Cc: Alex Williamson Cc: Christoph Hellwig Cc: Fei Li Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Ingo Molnar Cc: Paolo Bonzini Cc: Yonghua Huang Signed-off-by: Andrew Morton --- arch/s390/pci/pci_mmio.c | 4 ++-- arch/x86/mm/pat/memtype.c | 5 +---- drivers/vfio/vfio_iommu_type1.c | 4 ++-- drivers/virt/acrn/mm.c | 3 +-- include/linux/mm.h | 2 +- mm/memory.c | 15 ++++++++------- virt/kvm/kvm_main.c | 4 ++-- 7 files changed, 17 insertions(+), 20 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index a90499c087f0..5398729bfe1b 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -169,7 +169,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + ret = follow_pte(vma, mmio_addr, &ptep, &ptl); if (ret) goto out_unlock_mmap; @@ -308,7 +308,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + ret = follow_pte(vma, mmio_addr, &ptep, &ptl); if (ret) goto out_unlock_mmap; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index d01c3b0bd6eb..bdc2a240c2aa 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -954,10 +954,7 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, pte_t *ptep, pte; spinlock_t *ptl; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl)) + if (follow_pte(vma, vma->vm_start, &ptep, &ptl)) return -EINVAL; pte = ptep_get(ptep); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index b5c15fe8f9fc..3a0218171cfa 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -518,7 +518,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, spinlock_t *ptl; int ret; - ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + ret = follow_pte(vma, vaddr, &ptep, &ptl); if (ret) { bool unlocked = false; @@ -532,7 +532,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, if (ret) return ret; - ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + ret = follow_pte(vma, vaddr, &ptep, &ptl); if (ret) return ret; } diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c index 2d98e1e185c4..db8ff1d0ac23 100644 --- a/drivers/virt/acrn/mm.c +++ b/drivers/virt/acrn/mm.c @@ -187,8 +187,7 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap) } for (i = 0; i < nr_pages; i++) { - ret = follow_pte(vma->vm_mm, - memmap->vma_base + i * PAGE_SIZE, + ret = follow_pte(vma, memmap->vma_base + i * PAGE_SIZE, &ptep, &ptl); if (ret) break; diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b933f9f4864..c25ce6992951 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2420,7 +2420,7 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte(struct mm_struct *mm, unsigned long address, +int follow_pte(struct vm_area_struct *vma, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); diff --git a/mm/memory.c b/mm/memory.c index 59c05dc8b18a..1d45d25c1bba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5926,7 +5926,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) /** * follow_pte - look up PTE at a user virtual address - * @mm: the mm_struct of the target address space + * @vma: the memory mapping * @address: user virtual address * @ptepp: location to store found PTE * @ptlp: location to store the lock for the PTE @@ -5945,15 +5945,19 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) * * Return: zero on success, -ve otherwise. */ -int follow_pte(struct mm_struct *mm, unsigned long address, +int follow_pte(struct vm_area_struct *vma, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; @@ -6007,11 +6011,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, int offset = offset_in_page(addr); int ret = -EINVAL; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - retry: - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pte(vma, addr, &ptep, &ptl)) return -EINVAL; pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); @@ -6026,7 +6027,7 @@ retry: if (!maddr) return -ENOMEM; - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pte(vma, addr, &ptep, &ptl)) goto out_unmap; if (!pte_same(pte, ptep_get(ptep))) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fb49c2a60200..f57dbacb8689 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2902,7 +2902,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, spinlock_t *ptl; int r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pte(vma, addr, &ptep, &ptl); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does @@ -2917,7 +2917,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, if (r) return r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pte(vma, addr, &ptep, &ptl); if (r) return r; } -- cgit v1.2.3