From e0850113937b843c69b50b5d9087978ae4254be7 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:55 -0800 Subject: RDMA/siw: make use of the helper function kthread_run_on_cpu() Replace kthread_create/kthread_bind/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-3-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/sw/siw/siw_main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 9093e6a80b26..e5c586913d0b 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -98,15 +98,14 @@ static int siw_create_tx_threads(void) continue; siw_tx_thread[cpu] = - kthread_create(siw_run_sq, (unsigned long *)(long)cpu, - "siw_tx/%d", cpu); + kthread_run_on_cpu(siw_run_sq, + (unsigned long *)(long)cpu, + cpu, "siw_tx/%u"); if (IS_ERR(siw_tx_thread[cpu])) { siw_tx_thread[cpu] = NULL; continue; } - kthread_bind(siw_tx_thread[cpu], cpu); - wake_up_process(siw_tx_thread[cpu]); assigned++; } return assigned; -- cgit v1.2.3 From 972fa3a7c17c9d60212e32ecc0205dc585b1e769 Mon Sep 17 00:00:00 2001 From: Calvin Zhang Date: Fri, 14 Jan 2022 14:04:08 -0800 Subject: mm: kmemleak: alloc gray object for reserved region with direct map Reserved regions with direct mapping may contain references to other regions. CMA region with fixed location is reserved without creating kmemleak_object for it. So add them as gray kmemleak objects. Link: https://lkml.kernel.org/r/20211123090641.3654006-1-calvinzhang.cool@gmail.com Signed-off-by: Calvin Zhang Cc: Rob Herring Cc: Frank Rowand Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/fdt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index bdca35284ceb..116c582fea7a 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -26,6 +26,7 @@ #include #include #include +#include #include /* for COMMAND_LINE_SIZE */ #include @@ -522,9 +523,12 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, size = dt_mem_next_cell(dt_root_size_cells, &prop); if (size && - early_init_dt_reserve_memory_arch(base, size, nomap) == 0) + early_init_dt_reserve_memory_arch(base, size, nomap) == 0) { pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); + if (!nomap) + kmemleak_alloc_phys(base, size, 0, 0); + } else pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); -- cgit v1.2.3 From b9b5777f09be84d0de472ded2253d2f5101427f2 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:26 -0800 Subject: device-dax: use ALIGN() for determining pgoff Rather than calculating @pgoff manually, switch to ALIGN() instead. Link: https://lkml.kernel.org/r/20211202204422.26777-6-joao.m.martins@oracle.com Suggested-by: Dan Williams Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index dd8222a42808..0b82159b3564 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -234,8 +234,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, * mapped. No need to consider the zero page, or racing * conflicting mappings. */ - pgoff = linear_page_index(vmf->vma, vmf->address - & ~(fault_size - 1)); + pgoff = linear_page_index(vmf->vma, + ALIGN(vmf->address, fault_size)); for (i = 0; i < fault_size / PAGE_SIZE; i++) { struct page *page; -- cgit v1.2.3 From 09b80137033dbc5f1d197e99116527c0f8d253f2 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:29 -0800 Subject: device-dax: use struct_size() Use the struct_size() helper for the size of a struct with variable array member at the end, rather than manually calculating it. Link: https://lkml.kernel.org/r/20211202204422.26777-7-joao.m.martins@oracle.com Suggested-by: Dan Williams Signed-off-by: Joao Martins Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 0b82159b3564..038816b91af6 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -404,8 +404,9 @@ int dev_dax_probe(struct dev_dax *dev_dax) return -EINVAL; if (!pgmap) { - pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) - * (dev_dax->nr_range - 1), GFP_KERNEL); + pgmap = devm_kzalloc(dev, + struct_size(pgmap, ranges, dev_dax->nr_range - 1), + GFP_KERNEL); if (!pgmap) return -ENOMEM; pgmap->nr_range = dev_dax->nr_range; -- cgit v1.2.3 From fc65c4eb0b2a27c30d35636650e3f4ddb07506cd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:33 -0800 Subject: device-dax: ensure dev_dax->pgmap is valid for dynamic devices Right now, only static dax regions have a valid @pgmap pointer in its struct dev_dax. Dynamic dax case however, do not. In preparation for device-dax compound devmap support, make sure that dev_dax pgmap field is set after it has been allocated and initialized. dynamic dax device have the @pgmap is allocated at probe() and it's managed by devm (contrast to static dax region which a pgmap is provided and dax core kfrees it). So in addition to ensure a valid @pgmap, clear the pgmap when the dynamic dax device is released to avoid the same pgmap ranges to be re-requested across multiple region device reconfigs. Add a static_dev_dax() and use that helper in dev_dax_probe() to ensure the initialization differences between dynamic and static regions are more explicit. While at it, consolidate the ranges initialization when we allocate the @pgmap for the dynamic dax region case. Also take the opportunity to document the differences between static and dynamic da regions. Link: https://lkml.kernel.org/r/20211202204422.26777-8-joao.m.martins@oracle.com Suggested-by: Dan Williams Signed-off-by: Joao Martins Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/bus.c | 32 ++++++++++++++++++++++++++++++++ drivers/dax/bus.h | 1 + drivers/dax/device.c | 29 +++++++++++++++++++++-------- 3 files changed, 54 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 6cc4da4c713d..a22350e822fa 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -129,11 +129,35 @@ ATTRIBUTE_GROUPS(dax_drv); static int dax_bus_match(struct device *dev, struct device_driver *drv); +/* + * Static dax regions are regions created by an external subsystem + * nvdimm where a single range is assigned. Its boundaries are by the external + * subsystem and are usually limited to one physical memory range. For example, + * for PMEM it is usually defined by NVDIMM Namespace boundaries (i.e. a + * single contiguous range) + * + * On dynamic dax regions, the assigned region can be partitioned by dax core + * into multiple subdivisions. A subdivision is represented into one + * /dev/daxN.M device composed by one or more potentially discontiguous ranges. + * + * When allocating a dax region, drivers must set whether it's static + * (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned + * to dax core when calling devm_create_dev_dax(), whereas in dynamic dax + * devices it is NULL but afterwards allocated by dax core on device ->probe(). + * Care is needed to make sure that dynamic dax devices are torn down with a + * cleared @pgmap field (see kill_dev_dax()). + */ static bool is_static(struct dax_region *dax_region) { return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; } +bool static_dev_dax(struct dev_dax *dev_dax) +{ + return is_static(dev_dax->region); +} +EXPORT_SYMBOL_GPL(static_dev_dax); + static u64 dev_dax_size(struct dev_dax *dev_dax) { u64 size = 0; @@ -363,6 +387,14 @@ void kill_dev_dax(struct dev_dax *dev_dax) kill_dax(dax_dev); unmap_mapping_range(inode->i_mapping, 0, 0, 1); + + /* + * Dynamic dax region have the pgmap allocated via dev_kzalloc() + * and thus freed by devm. Clear the pgmap to not have stale pgmap + * ranges on probe() from previous reconfigurations of region devices. + */ + if (!static_dev_dax(dev_dax)) + dev_dax->pgmap = NULL; } EXPORT_SYMBOL_GPL(kill_dev_dax); diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 1e946ad7780a..4acdfee7dd59 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -48,6 +48,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv, __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); +bool static_dev_dax(struct dev_dax *dev_dax); #if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) int dev_dax_probe(struct dev_dax *dev_dax); diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 038816b91af6..630de5a795b0 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -398,18 +398,34 @@ int dev_dax_probe(struct dev_dax *dev_dax) void *addr; int rc, i; - pgmap = dev_dax->pgmap; - if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, - "static pgmap / multi-range device conflict\n")) - return -EINVAL; + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, + "static pgmap / multi-range device conflict\n"); + return -EINVAL; + } + + pgmap = dev_dax->pgmap; + } else { + if (dev_dax->pgmap) { + dev_warn(dev, + "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } - if (!pgmap) { pgmap = devm_kzalloc(dev, struct_size(pgmap, ranges, dev_dax->nr_range - 1), GFP_KERNEL); if (!pgmap) return -ENOMEM; + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + pgmap->ranges[i] = *range; + } } for (i = 0; i < dev_dax->nr_range; i++) { @@ -421,9 +437,6 @@ int dev_dax_probe(struct dev_dax *dev_dax) i, range->start, range->end); return -EBUSY; } - /* don't update the range for static pgmap */ - if (!dev_dax->pgmap) - pgmap->ranges[i] = *range; } pgmap->type = MEMORY_DEVICE_GENERIC; -- cgit v1.2.3 From a0fb038e50d72f8e60731dc48fb83a3a141b822e Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:36 -0800 Subject: device-dax: factor out page mapping initialization Move initialization of page->mapping into a separate helper. This is in preparation to move the mapping set to be prior to inserting the page table entry and also for tidying up compound page handling into one helper. Link: https://lkml.kernel.org/r/20211202204422.26777-9-joao.m.martins@oracle.com Signed-off-by: Joao Martins Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 630de5a795b0..9c87927d4bc2 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -73,6 +73,27 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, return -1; } +static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, + unsigned long fault_size) +{ + unsigned long i, nr_pages = fault_size / PAGE_SIZE; + struct file *filp = vmf->vma->vm_file; + pgoff_t pgoff; + + pgoff = linear_page_index(vmf->vma, + ALIGN(vmf->address, fault_size)); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); + + if (page->mapping) + continue; + + page->mapping = filp->f_mapping; + page->index = pgoff + i; + } +} + static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf, pfn_t *pfn) { @@ -224,28 +245,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, rc = VM_FAULT_SIGBUS; } - if (rc == VM_FAULT_NOPAGE) { - unsigned long i; - pgoff_t pgoff; - - /* - * In the device-dax case the only possibility for a - * VM_FAULT_NOPAGE result is when device-dax capacity is - * mapped. No need to consider the zero page, or racing - * conflicting mappings. - */ - pgoff = linear_page_index(vmf->vma, - ALIGN(vmf->address, fault_size)); - for (i = 0; i < fault_size / PAGE_SIZE; i++) { - struct page *page; - - page = pfn_to_page(pfn_t_to_pfn(pfn) + i); - if (page->mapping) - continue; - page->mapping = filp->f_mapping; - page->index = pgoff + i; - } - } + if (rc == VM_FAULT_NOPAGE) + dax_set_mapping(vmf, pfn, fault_size); dax_read_unlock(id); return rc; -- cgit v1.2.3 From 0e7325f03f09802d1667b8860e10fe39c25bf14c Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:40 -0800 Subject: device-dax: set mapping prior to vmf_insert_pfn{,_pmd,pud}() Normally, the @page mapping is set prior to inserting the page into a page table entry. Make device-dax adhere to the same ordering, rather than setting mapping after the PTE is inserted. The address_space never changes and it is always associated with the same inode and underlying pages. So, the page mapping is set once but cleared when the struct pages are removed/freed (i.e. after {devm_}memunmap_pages()). Link: https://lkml.kernel.org/r/20211202204422.26777-10-joao.m.martins@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Joao Martins Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 9c87927d4bc2..19a6b86486ce 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -121,6 +121,8 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + dax_set_mapping(vmf, *pfn, fault_size); + return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); } @@ -161,6 +163,8 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + dax_set_mapping(vmf, *pfn, fault_size); + return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } @@ -203,6 +207,8 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + dax_set_mapping(vmf, *pfn, fault_size); + return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } #else @@ -217,7 +223,6 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { struct file *filp = vmf->vma->vm_file; - unsigned long fault_size; vm_fault_t rc = VM_FAULT_SIGBUS; int id; pfn_t pfn; @@ -230,23 +235,18 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, id = dax_read_lock(); switch (pe_size) { case PE_SIZE_PTE: - fault_size = PAGE_SIZE; rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); break; case PE_SIZE_PMD: - fault_size = PMD_SIZE; rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); break; case PE_SIZE_PUD: - fault_size = PUD_SIZE; rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); break; default: rc = VM_FAULT_SIGBUS; } - if (rc == VM_FAULT_NOPAGE) - dax_set_mapping(vmf, pfn, fault_size); dax_read_unlock(id); return rc; -- cgit v1.2.3 From 6ec228b6fef5ad3a1f19e76c29640a9161415240 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:43 -0800 Subject: device-dax: remove pfn from __dev_dax_{pte,pmd,pud}_fault() After moving the page mapping to be set prior to pte insertion, the pfn in dev_dax_huge_fault() no longer is necessary. Remove it, as well as the @pfn argument passed to the internal fault handler helpers. [akpm@linux-foundation.org: fix CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=n build] Link: https://lkml.kernel.org/r/20211202204422.26777-11-joao.m.martins@oracle.com Signed-off-by: Joao Martins Suggested-by: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 19a6b86486ce..60d43cb195da 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -95,10 +95,11 @@ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, } static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { struct device *dev = &dev_dax->dev; phys_addr_t phys; + pfn_t pfn; unsigned int fault_size = PAGE_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) @@ -119,20 +120,21 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); - dax_set_mapping(vmf, *pfn, fault_size); + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); } static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dev_dax->dev; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PMD_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) @@ -161,21 +163,22 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); - dax_set_mapping(vmf, *pfn, fault_size); + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pud_addr = vmf->address & PUD_MASK; struct device *dev = &dev_dax->dev; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PUD_SIZE; @@ -205,15 +208,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); - dax_set_mapping(vmf, *pfn, fault_size); + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); } #else static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { return VM_FAULT_FALLBACK; } @@ -225,7 +228,6 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, struct file *filp = vmf->vma->vm_file; vm_fault_t rc = VM_FAULT_SIGBUS; int id; - pfn_t pfn; struct dev_dax *dev_dax = filp->private_data; dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, @@ -235,13 +237,13 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, id = dax_read_lock(); switch (pe_size) { case PE_SIZE_PTE: - rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pte_fault(dev_dax, vmf); break; case PE_SIZE_PMD: - rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pmd_fault(dev_dax, vmf); break; case PE_SIZE_PUD: - rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pud_fault(dev_dax, vmf); break; default: rc = VM_FAULT_SIGBUS; -- cgit v1.2.3 From 14606001efb48a17be31a5bec626c13ca49d783a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:47 -0800 Subject: device-dax: compound devmap support Use the newly added compound devmap facility which maps the assigned dax ranges as compound pages at a page size of @align. dax devices are created with a fixed @align (huge page size) which is enforced through as well at mmap() of the device. Faults, consequently happen too at the specified @align specified at the creation, and those don't change throughout dax device lifetime. MCEs unmap a whole dax huge page, as well as splits occurring at the configured page size. Performance measured by gup_test improves considerably for unpin_user_pages() and altmap with NVDIMMs: $ gup_test -f /dev/dax1.0 -m 16384 -r 10 -S -a -n 512 -w (pin_user_pages_fast 2M pages) put:~71 ms -> put:~22 ms [altmap] (pin_user_pages_fast 2M pages) get:~524ms put:~525 ms -> get: ~127ms put:~71ms $ gup_test -f /dev/dax1.0 -m 129022 -r 10 -S -a -n 512 -w (pin_user_pages_fast 2M pages) put:~513 ms -> put:~188 ms [altmap with -m 127004] (pin_user_pages_fast 2M pages) get:~4.1 secs put:~4.12 secs -> get:~1sec put:~563ms .. as well as unpin_user_page_range_dirty_lock() being just as effective as THP/hugetlb[0] pages. [0] https://lore.kernel.org/linux-mm/20210212130843.13865-5-joao.m.martins@oracle.com/ Link: https://lkml.kernel.org/r/20211202204422.26777-12-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 60d43cb195da..591f293d326f 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -78,14 +78,20 @@ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, { unsigned long i, nr_pages = fault_size / PAGE_SIZE; struct file *filp = vmf->vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; pgoff_t pgoff; + /* mapping is only set on the head */ + if (dev_dax->pgmap->vmemmap_shift) + nr_pages = 1; + pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size)); for (i = 0; i < nr_pages; i++) { struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); + page = compound_head(page); if (page->mapping) continue; @@ -443,6 +449,9 @@ int dev_dax_probe(struct dev_dax *dev_dax) } pgmap->type = MEMORY_DEVICE_GENERIC; + if (dev_dax->align > PAGE_SIZE) + pgmap->vmemmap_shift = + order_base_2(dev_dax->align >> PAGE_SHIFT); addr = devm_memremap_pages(dev, pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); -- cgit v1.2.3 From 7f0d267243aa9dd32944bd7d3b34afff60545edb Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 14 Jan 2022 14:09:22 -0800 Subject: zram: use ATTRIBUTE_GROUPS Embrace ATTRIBUTE_GROUPS to avoid boiler plate code. This should not introduce any functional changes. Link: https://lkml.kernel.org/r/20211028203600.2157356-1-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Reviewed-by: Bart Van Assche Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 25071126995b..9a46b2ef6951 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1903,14 +1903,7 @@ static struct attribute *zram_disk_attrs[] = { NULL, }; -static const struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; - -static const struct attribute_group *zram_disk_attr_groups[] = { - &zram_disk_attr_group, - NULL, -}; +ATTRIBUTE_GROUPS(zram_disk); /* * Allocate and initialize new zram device. the function returns @@ -1982,7 +1975,7 @@ static int zram_add(void) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); - ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups); + ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; -- cgit v1.2.3