From 54d020692b342f7bd02d7f5795fb5c401caecfcc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:33 +0100 Subject: mm/gup: remove unused vmas parameter from get_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "remove the vmas parameter from GUP APIs", v6. (pin_/get)_user_pages[_remote]() each provide an optional output parameter for an array of VMA objects associated with each page in the input range. These provide the means for VMAs to be returned, as long as mm->mmap_lock is never released during the GUP operation (i.e. the internal flag FOLL_UNLOCKABLE is not specified). In addition, these VMAs can only be accessed with the mmap_lock held and become invalidated the moment it is released. The vast majority of invocations do not use this functionality and of those that do, all but one case retrieve a single VMA to perform checks upon. It is not egregious in the single VMA cases to simply replace the operation with a vma_lookup(). In these cases we duplicate the (fast) lookup on a slow path already under the mmap_lock, abstracted to a new get_user_page_vma_remote() inline helper function which also performs error checking and reference count maintenance. The special case is io_uring, where io_pin_pages() specifically needs to assert that the VMAs underlying the range do not result in broken long-term GUP file-backed mappings. As GUP now internally asserts that FOLL_LONGTERM mappings are not file-backed in a broken fashion (i.e. requiring dirty tracking) - as implemented in "mm/gup: disallow FOLL_LONGTERM GUP-nonfast writing to file-backed mappings" - this logic is no longer required and so we can simply remove it altogether from io_uring. Eliminating the vmas parameter eliminates an entire class of danging pointer errors that might have occured should the lock have been incorrectly released. In addition, the API is simplified and now clearly expresses what it is intended for - applying the specified GUP flags and (if pinning) returning pinned pages. This change additionally opens the door to further potential improvements in GUP and the possible marrying of disparate code paths. I have run this series against gup_test with no issues. Thanks to Matthew Wilcox for suggesting this refactoring! This patch (of 6): No invocation of get_user_pages() use the vmas parameter, so remove it. The GUP API is confusing and caveated. Recent changes have done much to improve that, however there is more we can do. Exporting vmas is a prime target as the caller has to be extremely careful to preclude their use after the mmap_lock has expired or otherwise be left with dangling pointers. Removing the vmas parameter focuses the GUP functions upon their primary purpose - pinning (and outputting) pages as well as performing the actions implied by the input flags. This is part of a patch series aiming to remove the vmas parameter altogether. Link: https://lkml.kernel.org/r/cover.1684350871.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/589e0c64794668ffc799651e8d85e703262b1e9d.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Matthew Wilcox (Oracle) Acked-by: Greg Kroah-Hartman Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Acked-by: Christian König (for radeon parts) Acked-by: Jarkko Sakkinen Reviewed-by: Christoph Hellwig Acked-by: Sean Christopherson (KVM) Cc: Catalin Marinas Cc: Dennis Dalessandro Cc: Janosch Frank Cc: Jens Axboe Cc: Sakari Ailus Signed-off-by: Andrew Morton --- drivers/gpu/drm/radeon/radeon_ttm.c | 2 +- drivers/misc/sgi-gru/grufault.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 2220cdf6a3f6..3a9db030f98f 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -359,7 +359,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_device *bdev, struct ttm_tt *ttm struct page **pages = ttm->pages + pinned; r = get_user_pages(userptr, num_pages, write ? FOLL_WRITE : 0, - pages, NULL); + pages); if (r < 0) goto release_pages; diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index b836936e9747..378cf02a2aa1 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -185,7 +185,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma, #else *pageshift = PAGE_SHIFT; #endif - if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0) + if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page) <= 0) return -EFAULT; *paddr = page_to_phys(page); put_page(page); -- cgit v1.2.3 From 0b295316b3a9b7858eafbebdc31b4827a6edde03 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:36 +0100 Subject: mm/gup: remove unused vmas parameter from pin_user_pages_remote() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No invocation of pin_user_pages_remote() uses the vmas parameter, so remove it. This forms part of a larger patch set eliminating the use of the vmas parameters altogether. Link: https://lkml.kernel.org/r/28f000beb81e45bf538a2aaa77c90f5482b67a32.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Cc: Catalin Marinas Cc: Christian König Cc: Dennis Dalessandro Cc: Greg Kroah-Hartman Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sakari Ailus Cc: Sean Christopherson Signed-off-by: Andrew Morton --- drivers/iommu/iommufd/pages.c | 4 ++-- drivers/vfio/vfio_iommu_type1.c | 2 +- include/linux/mm.h | 2 +- kernel/trace/trace_events_user.c | 2 +- mm/gup.c | 8 +++----- mm/process_vm_access.c | 2 +- 6 files changed, 9 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 3c47846cc5ef..412ca96be128 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -786,7 +786,7 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, user->locked = 1; } rc = pin_user_pages_remote(pages->source_mm, uptr, npages, - user->gup_flags, user->upages, NULL, + user->gup_flags, user->upages, &user->locked); } if (rc <= 0) { @@ -1799,7 +1799,7 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, rc = pin_user_pages_remote( pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page, - NULL, NULL); + NULL); mmap_read_unlock(pages->source_mm); if (rc != 1) { if (WARN_ON(rc >= 0)) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0d2f805468e1..306e6f1d1c70 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -562,7 +562,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, mmap_read_lock(mm); ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, - pages, NULL, NULL); + pages, NULL); if (ret > 0) { int i; diff --git a/include/linux/mm.h b/include/linux/mm.h index 6336253c18e2..cf17ffdf4fbf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2367,7 +2367,7 @@ long get_user_pages_remote(struct mm_struct *mm, long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..bdc2666e8d39 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -406,7 +406,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, - &page, NULL, NULL); + &page, NULL); if (unlikely(ret <= 0)) { if (!fixup_fault) diff --git a/mm/gup.c b/mm/gup.c index 21daeee5f163..edf0fe2695b0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3100,8 +3100,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. @@ -3116,14 +3114,14 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { int local_locked = 1; - if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + if (!is_valid_gup_args(pages, NULL, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, + return __gup_longterm_locked(mm, start, nr_pages, pages, NULL, locked ? locked : &local_locked, gup_flags); } diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 78dfaf9e8990..0523edab03a6 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -104,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr, mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, - NULL, &locked); + &locked); if (locked) mmap_read_unlock(mm); if (pinned_pages <= 0) -- cgit v1.2.3 From 4c630f307455c06f99bdeca7f7a1ab5318604fe0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:45 +0100 Subject: mm/gup: remove vmas parameter from pin_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are now in a position where no caller of pin_user_pages() requires the vmas parameter at all, so eliminate this parameter from the function and all callers. This clears the way to removing the vmas parameter from GUP altogether. Link: https://lkml.kernel.org/r/195a99ae949c9f5cb589d2222b736ced96ec199a.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Dennis Dalessandro [qib] Reviewed-by: Christoph Hellwig Acked-by: Sakari Ailus [drivers/media] Cc: Catalin Marinas Cc: Christian König Cc: Greg Kroah-Hartman Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sean Christopherson Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/iommu_api.c | 2 +- drivers/infiniband/hw/qib/qib_user_pages.c | 2 +- drivers/infiniband/hw/usnic/usnic_uiom.c | 2 +- drivers/infiniband/sw/siw/siw_mem.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 2 +- drivers/vdpa/vdpa_user/vduse_dev.c | 2 +- drivers/vhost/vdpa.c | 2 +- include/linux/mm.h | 3 +-- io_uring/rsrc.c | 2 +- mm/gup.c | 9 +++------ mm/gup_test.c | 9 ++++----- net/xdp/xdp_umem.c | 2 +- 12 files changed, 17 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 81d7185e2ae8..d19fb1f3007d 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -105,7 +105,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n, FOLL_WRITE | FOLL_LONGTERM, - mem->hpages + entry, NULL); + mem->hpages + entry); if (ret == n) { pinned += n; continue; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index f693bc753b6b..1bb7507325bc 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -111,7 +111,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, ret = pin_user_pages(start_page + got * PAGE_SIZE, num_pages - got, FOLL_LONGTERM | FOLL_WRITE, - p + got, NULL); + p + got); if (ret < 0) { mmap_read_unlock(current->mm); goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 2a5cac2658ec..84e0f41e7dfa 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -140,7 +140,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = pin_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + gup_flags, page_list); if (ret < 0) goto out; diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index f51ab2ccf151..e6e25f15567d 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -422,7 +422,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) umem->page_chunk[i].plist = plist; while (nents) { rv = pin_user_pages(first_page_va, nents, foll_flags, - plist, NULL); + plist); if (rv < 0) goto out_sem_up; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 53001532e8e3..405b89ea1054 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -180,7 +180,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, data, size, dma->nr_pages); err = pin_user_pages(data & PAGE_MASK, dma->nr_pages, gup_flags, - dma->pages, NULL); + dma->pages); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index de97e38c3b82..4d4405f058e8 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1052,7 +1052,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, goto out; pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, - page_list, NULL); + page_list); if (pinned != npages) { ret = pinned < 0 ? pinned : -ENOMEM; goto out; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 8c1aefc865f0..61223fcbe82b 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -983,7 +983,7 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v, while (npages) { sz2pin = min_t(unsigned long, npages, list_size); pinned = pin_user_pages(cur_base, sz2pin, - gup_flags, page_list, NULL); + gup_flags, page_list); if (sz2pin != pinned) { if (pinned < 0) { ret = pinned; diff --git a/include/linux/mm.h b/include/linux/mm.h index fcbfb961b49f..280429ffa91d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2399,8 +2399,7 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index b6451f8bc5d5..b56bda46a9eb 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1044,7 +1044,7 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) ret = 0; mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages, NULL); + pages); if (pret == nr_pages) *npages = nr_pages; else diff --git a/mm/gup.c b/mm/gup.c index 764bf0c20827..18e3bc2ee3f1 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3131,8 +3131,6 @@ EXPORT_SYMBOL(pin_user_pages_remote); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. @@ -3141,15 +3139,14 @@ EXPORT_SYMBOL(pin_user_pages_remote); * see Documentation/core-api/pin_user_pages.rst for details. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + unsigned int gup_flags, struct page **pages) { int locked = 1; - if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, &locked, gup_flags); + pages, NULL, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); diff --git a/mm/gup_test.c b/mm/gup_test.c index 9ba8ea23f84e..1668ce0e0783 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -146,18 +146,17 @@ static int __gup_test_ioctl(unsigned int cmd, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, - NULL); + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, gup->gup_flags | FOLL_LONGTERM, - pages + i, NULL); + pages + i); break; case DUMP_USER_PAGES_TEST: if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) nr = pin_user_pages(addr, nr, gup->gup_flags, - pages + i, NULL); + pages + i); else nr = get_user_pages(addr, nr, gup->gup_flags, pages + i); @@ -270,7 +269,7 @@ static inline int pin_longterm_test_start(unsigned long arg) gup_flags, pages); else cur_pages = pin_user_pages(addr, remaining_pages, - gup_flags, pages, NULL); + gup_flags, pages); if (cur_pages < 0) { pin_longterm_test_stop(); ret = cur_pages; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 02207e852d79..06cead2b8e34 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -103,7 +103,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) mmap_read_lock(current->mm); npgs = pin_user_pages(address, umem->npgs, - gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); + gup_flags | FOLL_LONGTERM, &umem->pgs[0]); mmap_read_unlock(current->mm); if (npgs != umem->npgs) { -- cgit v1.2.3 From be6a5b5e9e47a7402083d931e18d594338f1849a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:49 +0100 Subject: drivers/base: use ARCH_DMA_MINALIGN instead of ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN represents the minimum (static) alignment for safe DMA operations while ARCH_KMALLOC_MINALIGN is the minimum kmalloc() objects alignment. Link: https://lkml.kernel.org/r/20230612153201.554742-6-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Acked-by: Greg Kroah-Hartman Tested-by: Isaac J. Manjarres Cc: "Rafael J. Wysocki" Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: Robin Murphy Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/base/devres.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/base/devres.c b/drivers/base/devres.c index 5c998cfac335..3df0025d12aa 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -29,10 +29,10 @@ struct devres { * Some archs want to perform DMA into kmalloc caches * and need a guaranteed alignment larger than * the alignment of a 64-bit integer. - * Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same - * buffer alignment as if it was allocated by plain kmalloc(). + * Thus we use ARCH_DMA_MINALIGN for data[] which will force the same + * alignment for struct devres when allocated by kmalloc(). */ - u8 __aligned(ARCH_KMALLOC_MINALIGN) data[]; + u8 __aligned(ARCH_DMA_MINALIGN) data[]; }; struct devres_group { -- cgit v1.2.3 From 6716ccaf43e0fe2e759b84eb1cef4c684873a847 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:50 +0100 Subject: drivers/gpu: use ARCH_DMA_MINALIGN instead of ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN represents the minimum (static) alignment for safe DMA operations while ARCH_KMALLOC_MINALIGN is the minimum kmalloc() objects alignment. Link: https://lkml.kernel.org/r/20230612153201.554742-7-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Isaac J. Manjarres Cc: Daniel Vetter Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Robin Murphy Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_managed.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/drm_managed.c b/drivers/gpu/drm/drm_managed.c index c21c3f623033..5423ad883729 100644 --- a/drivers/gpu/drm/drm_managed.c +++ b/drivers/gpu/drm/drm_managed.c @@ -49,10 +49,10 @@ struct drmres { * Some archs want to perform DMA into kmalloc caches * and need a guaranteed alignment larger than * the alignment of a 64-bit integer. - * Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same - * buffer alignment as if it was allocated by plain kmalloc(). + * Thus we use ARCH_DMA_MINALIGN for data[] which will force the same + * alignment for struct drmres when allocated by kmalloc(). */ - u8 __aligned(ARCH_KMALLOC_MINALIGN) data[]; + u8 __aligned(ARCH_DMA_MINALIGN) data[]; }; static void free_dr(struct drmres *dr) -- cgit v1.2.3 From 075efe7c1656932f60384fed36ed05a79c6b114b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:51 +0100 Subject: drivers/usb: use ARCH_DMA_MINALIGN instead of ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN represents the minimum (static) alignment for safe DMA operations while ARCH_KMALLOC_MINALIGN is the minimum kmalloc() objects alignment. Link: https://lkml.kernel.org/r/20230612153201.554742-8-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Acked-by: Greg Kroah-Hartman Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Robin Murphy Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/usb/core/buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c index fbb087b728dc..e21d8d106977 100644 --- a/drivers/usb/core/buffer.c +++ b/drivers/usb/core/buffer.c @@ -34,13 +34,13 @@ void __init usb_init_pool_max(void) { /* * The pool_max values must never be smaller than - * ARCH_KMALLOC_MINALIGN. + * ARCH_DMA_MINALIGN. */ - if (ARCH_KMALLOC_MINALIGN <= 32) + if (ARCH_DMA_MINALIGN <= 32) ; /* Original value is okay */ - else if (ARCH_KMALLOC_MINALIGN <= 64) + else if (ARCH_DMA_MINALIGN <= 64) pool_max[0] = 64; - else if (ARCH_KMALLOC_MINALIGN <= 128) + else if (ARCH_DMA_MINALIGN <= 128) pool_max[0] = 0; /* Don't use this pool */ else BUILD_BUG(); /* We don't allow this */ -- cgit v1.2.3 From 3cbbb41049accbf2f6f6e79f6870f39b116cd585 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:52 +0100 Subject: drivers/spi: use ARCH_DMA_MINALIGN instead of ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN represents the minimum (static) alignment for safe DMA operations while ARCH_KMALLOC_MINALIGN is the minimum kmalloc() objects alignment. Link: https://lkml.kernel.org/r/20230612153201.554742-9-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Acked-by: Mark Brown Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Robin Murphy Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/spi/spidev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 39d94c850839..8d009275a59d 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -237,7 +237,7 @@ static int spidev_message(struct spidev_data *spidev, /* Ensure that also following allocations from rx_buf/tx_buf will meet * DMA alignment requirements. */ - unsigned int len_aligned = ALIGN(u_tmp->len, ARCH_KMALLOC_MINALIGN); + unsigned int len_aligned = ALIGN(u_tmp->len, ARCH_DMA_MINALIGN); k_tmp->len = u_tmp->len; -- cgit v1.2.3 From 7bc757140f07ee28cfc70767f5d5bf9f78c621fe Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:53 +0100 Subject: dm-crypt: use ARCH_DMA_MINALIGN instead of ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN represents the minimum (static) alignment for safe DMA operations while ARCH_KMALLOC_MINALIGN is the minimum kmalloc() objects alignment. Link: https://lkml.kernel.org/r/20230612153201.554742-10-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: "Rafael J. Wysocki" Cc: Robin Murphy Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8b47b913ee83..ebbd8f7db880 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3256,7 +3256,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->per_bio_data_size = ti->per_io_data_size = ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, - ARCH_KMALLOC_MINALIGN); + ARCH_DMA_MINALIGN); ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc); if (ret) { -- cgit v1.2.3 From af2880ec44021d32cc72a5aa7c5d7d7beaa722d3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jun 2023 16:31:56 +0100 Subject: scatterlist: add dedicated config for DMA flags The DMA flags field will be useful for users beyond PCI P2P, so upgrade to its own dedicated config option. [catalin.marinas@arm.com: use #ifdef CONFIG_NEED_SG_DMA_FLAGS in scatterlist.h] [catalin.marinas@arm.com: update PCI_P2PDMA dma_flags comment in scatterlist.h] Link: https://lkml.kernel.org/r/20230612153201.554742-13-catalin.marinas@arm.com Signed-off-by: Robin Murphy Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/pci/Kconfig | 1 + include/linux/scatterlist.h | 13 ++++++------- kernel/dma/Kconfig | 3 +++ 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 9309f2469b41..3c07d8d214b3 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -168,6 +168,7 @@ config PCI_P2PDMA # depends on 64BIT select GENERIC_ALLOCATOR + select NEED_SG_DMA_FLAGS help Enableѕ drivers to do PCI peer-to-peer transactions to and from BARs that are exposed in other devices that are the part of diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 375a5e90d86a..19833fd4113b 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -16,7 +16,7 @@ struct scatterlist { #ifdef CONFIG_NEED_SG_DMA_LENGTH unsigned int dma_length; #endif -#ifdef CONFIG_PCI_P2PDMA +#ifdef CONFIG_NEED_SG_DMA_FLAGS unsigned int dma_flags; #endif }; @@ -249,12 +249,11 @@ static inline void sg_unmark_end(struct scatterlist *sg) } /* - * CONFGI_PCI_P2PDMA depends on CONFIG_64BIT which means there is 4 bytes - * in struct scatterlist (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). - * Use this padding for DMA flags bits to indicate when a specific - * dma address is a bus address. + * One 64-bit architectures there is a 4-byte padding in struct scatterlist + * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA + * flags bits to indicate when a specific dma address is a bus address. */ -#ifdef CONFIG_PCI_P2PDMA +#ifdef CONFIG_NEED_SG_DMA_FLAGS #define SG_DMA_BUS_ADDRESS (1 << 0) @@ -312,7 +311,7 @@ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { } -#endif +#endif /* CONFIG_NEED_SG_DMA_FLAGS */ /** * sg_phys - Return physical address of an sg entry diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 6677d0e64d27..acc6f231259c 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -24,6 +24,9 @@ config DMA_OPS_BYPASS config ARCH_HAS_DMA_MAP_DIRECT bool +config NEED_SG_DMA_FLAGS + bool + config NEED_SG_DMA_LENGTH bool -- cgit v1.2.3 From cb147bbe22d2be9b49021c2e5dacdf2935745d1c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jun 2023 16:31:57 +0100 Subject: dma-mapping: name SG DMA flag helpers consistently sg_is_dma_bus_address() is inconsistent with the naming pattern of its corresponding setters and its own kerneldoc, so take the majority vote and rename it sg_dma_is_bus_address() (and fix up the missing underscores in the kerneldoc too). This gives us a nice clear pattern where SG DMA flags are SG_DMA_, and the helpers for acting on them are sg_dma__(). Link: https://lkml.kernel.org/r/20230612153201.554742-14-catalin.marinas@arm.com Signed-off-by: Robin Murphy Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Reviewed-by: Jerry Snitselaar Reviewed-by: Logan Gunthorpe Link: https://lore.kernel.org/r/fa2eca2862c7ffc41b50337abffb2dfd2864d3ea.1685036694.git.robin.murphy@arm.com Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/iommu/dma-iommu.c | 8 ++++---- drivers/iommu/iommu.c | 2 +- include/linux/scatterlist.h | 8 ++++---- kernel/dma/direct.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 7a9f0b0bddbd..b8bba4aa196f 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1080,7 +1080,7 @@ static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents, sg_dma_address(s) = DMA_MAPPING_ERROR; sg_dma_len(s) = 0; - if (sg_is_dma_bus_address(s)) { + if (sg_dma_is_bus_address(s)) { if (i > 0) cur = sg_next(cur); @@ -1136,7 +1136,7 @@ static void __invalidate_sg(struct scatterlist *sg, int nents) int i; for_each_sg(sg, s, nents, i) { - if (sg_is_dma_bus_address(s)) { + if (sg_dma_is_bus_address(s)) { sg_dma_unmark_bus_address(s); } else { if (sg_dma_address(s) != DMA_MAPPING_ERROR) @@ -1329,7 +1329,7 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, * just have to be determined. */ for_each_sg(sg, tmp, nents, i) { - if (sg_is_dma_bus_address(tmp)) { + if (sg_dma_is_bus_address(tmp)) { sg_dma_unmark_bus_address(tmp); continue; } @@ -1343,7 +1343,7 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, nents -= i; for_each_sg(tmp, tmp, nents, i) { - if (sg_is_dma_bus_address(tmp)) { + if (sg_dma_is_bus_address(tmp)) { sg_dma_unmark_bus_address(tmp); continue; } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f1dcfa3f1a1b..eb620552967b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2567,7 +2567,7 @@ ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, len = 0; } - if (sg_is_dma_bus_address(sg)) + if (sg_dma_is_bus_address(sg)) goto next; if (len) { diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 19833fd4113b..2f06178996ba 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -258,7 +258,7 @@ static inline void sg_unmark_end(struct scatterlist *sg) #define SG_DMA_BUS_ADDRESS (1 << 0) /** - * sg_dma_is_bus address - Return whether a given segment was marked + * sg_dma_is_bus_address - Return whether a given segment was marked * as a bus address * @sg: SG entry * @@ -266,13 +266,13 @@ static inline void sg_unmark_end(struct scatterlist *sg) * Returns true if sg_dma_mark_bus_address() has been called on * this segment. **/ -static inline bool sg_is_dma_bus_address(struct scatterlist *sg) +static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_BUS_ADDRESS; } /** - * sg_dma_mark_bus address - Mark the scatterlist entry as a bus address + * sg_dma_mark_bus_address - Mark the scatterlist entry as a bus address * @sg: SG entry * * Description: @@ -300,7 +300,7 @@ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) #else -static inline bool sg_is_dma_bus_address(struct scatterlist *sg) +static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return false; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 5595d1d5cdcc..d29cade048db 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -463,7 +463,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int i; for_each_sg(sgl, sg, nents, i) { - if (sg_is_dma_bus_address(sg)) + if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else dma_direct_unmap_page(dev, sg->dma_address, -- cgit v1.2.3 From 861370f49ce484cd6ef2e9b3ad06d137f3cb0ca3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:59 +0100 Subject: iommu/dma: force bouncing if the size is not cacheline-aligned Similarly to the direct DMA, bounce small allocations as they may have originated from a kmalloc() cache not safe for DMA. Unlike the direct DMA, iommu_dma_map_sg() cannot call iommu_dma_map_sg_swiotlb() for all non-coherent devices as this would break some cases where the iova is expected to be contiguous (dmabuf). Instead, scan the scatterlist for any small sizes and only go the swiotlb path if any element of the list needs bouncing (note that iommu_dma_map_page() would still only bounce those buffers which are not DMA-aligned). To avoid scanning the scatterlist on the 'sync' operations, introduce an SG_DMA_SWIOTLB flag set by iommu_dma_map_sg_swiotlb(). The dev_use_swiotlb() function together with the newly added dev_use_sg_swiotlb() now check for both untrusted devices and unaligned kmalloc() buffers (suggested by Robin Murphy). Link: https://lkml.kernel.org/r/20230612153201.554742-16-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reviewed-by: Robin Murphy Tested-by: Isaac J. Manjarres Cc: Joerg Roedel Cc: Christoph Hellwig Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/iommu/Kconfig | 1 + drivers/iommu/dma-iommu.c | 50 +++++++++++++++++++++++++++++++++++++-------- include/linux/scatterlist.h | 41 +++++++++++++++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6de900776e24..74c45359869d 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -152,6 +152,7 @@ config IOMMU_DMA select IOMMU_IOVA select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH + select NEED_SG_DMA_FLAGS if SWIOTLB # Shared Virtual Addressing config IOMMU_SVA diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index b8bba4aa196f..e86ae462cade 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -520,9 +520,38 @@ static bool dev_is_untrusted(struct device *dev) return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; } -static bool dev_use_swiotlb(struct device *dev) +static bool dev_use_swiotlb(struct device *dev, size_t size, + enum dma_data_direction dir) { - return IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev); + return IS_ENABLED(CONFIG_SWIOTLB) && + (dev_is_untrusted(dev) || + dma_kmalloc_needs_bounce(dev, size, dir)); +} + +static bool dev_use_sg_swiotlb(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + struct scatterlist *s; + int i; + + if (!IS_ENABLED(CONFIG_SWIOTLB)) + return false; + + if (dev_is_untrusted(dev)) + return true; + + /* + * If kmalloc() buffers are not DMA-safe for this device and + * direction, check the individual lengths in the sg list. If any + * element is deemed unsafe, use the swiotlb for bouncing. + */ + if (!dma_kmalloc_safe(dev, dir)) { + for_each_sg(sg, s, nents, i) + if (!dma_kmalloc_size_aligned(s->length)) + return true; + } + + return false; } /** @@ -922,7 +951,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, { phys_addr_t phys; - if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev)) + if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev, size, dir)) return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); @@ -938,7 +967,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev, { phys_addr_t phys; - if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev)) + if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev, size, dir)) return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); @@ -956,7 +985,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg; int i; - if (dev_use_swiotlb(dev)) + if (sg_dma_is_swiotlb(sgl)) for_each_sg(sgl, sg, nelems, i) iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), sg->length, dir); @@ -972,7 +1001,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg; int i; - if (dev_use_swiotlb(dev)) + if (sg_dma_is_swiotlb(sgl)) for_each_sg(sgl, sg, nelems, i) iommu_dma_sync_single_for_device(dev, sg_dma_address(sg), @@ -998,7 +1027,8 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, * If both the physical buffer start address and size are * page aligned, we don't need to use a bounce page. */ - if (dev_use_swiotlb(dev) && iova_offset(iovad, phys | size)) { + if (dev_use_swiotlb(dev, size, dir) && + iova_offset(iovad, phys | size)) { void *padding_start; size_t padding_size, aligned_size; @@ -1166,6 +1196,8 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, struct scatterlist *s; int i; + sg_dma_mark_swiotlb(sg); + for_each_sg(sg, s, nents, i) { sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s), s->offset, s->length, dir, attrs); @@ -1210,7 +1242,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, goto out; } - if (dev_use_swiotlb(dev)) + if (dev_use_sg_swiotlb(dev, sg, nents, dir)) return iommu_dma_map_sg_swiotlb(dev, sg, nents, dir, attrs); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) @@ -1315,7 +1347,7 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, struct scatterlist *tmp; int i; - if (dev_use_swiotlb(dev)) { + if (sg_dma_is_swiotlb(sg)) { iommu_dma_unmap_sg_swiotlb(dev, sg, nents, dir, attrs); return; } diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 2f06178996ba..ec46d8e8e49d 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -251,11 +251,13 @@ static inline void sg_unmark_end(struct scatterlist *sg) /* * One 64-bit architectures there is a 4-byte padding in struct scatterlist * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA - * flags bits to indicate when a specific dma address is a bus address. + * flags bits to indicate when a specific dma address is a bus address or the + * buffer may have been bounced via SWIOTLB. */ #ifdef CONFIG_NEED_SG_DMA_FLAGS -#define SG_DMA_BUS_ADDRESS (1 << 0) +#define SG_DMA_BUS_ADDRESS (1 << 0) +#define SG_DMA_SWIOTLB (1 << 1) /** * sg_dma_is_bus_address - Return whether a given segment was marked @@ -298,6 +300,34 @@ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) sg->dma_flags &= ~SG_DMA_BUS_ADDRESS; } +/** + * sg_dma_is_swiotlb - Return whether the scatterlist was marked for SWIOTLB + * bouncing + * @sg: SG entry + * + * Description: + * Returns true if the scatterlist was marked for SWIOTLB bouncing. Not all + * elements may have been bounced, so the caller would have to check + * individual SG entries with is_swiotlb_buffer(). + */ +static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) +{ + return sg->dma_flags & SG_DMA_SWIOTLB; +} + +/** + * sg_dma_mark_swiotlb - Mark the scatterlist for SWIOTLB bouncing + * @sg: SG entry + * + * Description: + * Marks a a scatterlist for SWIOTLB bounce. Not all SG entries may be + * bounced. + */ +static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) +{ + sg->dma_flags |= SG_DMA_SWIOTLB; +} + #else static inline bool sg_dma_is_bus_address(struct scatterlist *sg) @@ -310,6 +340,13 @@ static inline void sg_dma_mark_bus_address(struct scatterlist *sg) static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { } +static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) +{ + return false; +} +static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) +{ +} #endif /* CONFIG_NEED_SG_DMA_FLAGS */ -- cgit v1.2.3 From c33c794828f21217f72ce6fc140e0d34e0d56bff Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Jun 2023 16:15:45 +0100 Subject: mm: ptep_get() conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexander Shishkin Cc: Alex Williamson Cc: Al Viro Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christian Brauner Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dave Airlie Cc: Dimitri Sivanich Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Jason Gunthorpe Cc: Jérôme Glisse Cc: Jiri Olsa Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Namhyung Kim Cc: Naoya Horiguchi Cc: Oleksandr Tyshchenko Cc: Pavel Tatashin Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Cc: Yu Zhao Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c | 8 +- drivers/misc/sgi-gru/grufault.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 7 +- drivers/xen/privcmd.c | 2 +- fs/proc/task_mmu.c | 33 +++---- fs/userfaultfd.c | 6 +- include/linux/hugetlb.h | 4 + include/linux/mm_inline.h | 2 +- include/linux/pgtable.h | 6 +- kernel/events/uprobes.c | 2 +- mm/damon/ops-common.c | 2 +- mm/damon/paddr.c | 2 +- mm/damon/vaddr.c | 10 ++- mm/filemap.c | 2 +- mm/gup.c | 21 +++-- mm/highmem.c | 12 +-- mm/hmm.c | 2 +- mm/huge_memory.c | 4 +- mm/hugetlb.c | 2 +- mm/hugetlb_vmemmap.c | 6 +- mm/kasan/init.c | 9 +- mm/kasan/shadow.c | 10 +-- mm/khugepaged.c | 22 ++--- mm/ksm.c | 22 ++--- mm/madvise.c | 6 +- mm/mapping_dirty_helpers.c | 4 +- mm/memcontrol.c | 4 +- mm/memory-failure.c | 26 +++--- mm/memory.c | 100 +++++++++++---------- mm/mempolicy.c | 6 +- mm/migrate.c | 14 +-- mm/migrate_device.c | 15 ++-- mm/mincore.c | 2 +- mm/mlock.c | 6 +- mm/mprotect.c | 8 +- mm/mremap.c | 2 +- mm/page_table_check.c | 4 +- mm/page_vma_mapped.c | 27 +++--- mm/pgtable-generic.c | 2 +- mm/rmap.c | 34 ++++--- mm/sparse-vmemmap.c | 8 +- mm/swap_state.c | 8 +- mm/swapfile.c | 20 +++-- mm/userfaultfd.c | 4 +- mm/vmalloc.c | 6 +- mm/vmscan.c | 14 +-- virt/kvm/kvm_main.c | 11 ++- 47 files changed, 301 insertions(+), 228 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c index 56279908ed30..01e271b6ad21 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c @@ -1681,7 +1681,9 @@ static int igt_mmap_gpu(void *arg) static int check_present_pte(pte_t *pte, unsigned long addr, void *data) { - if (!pte_present(*pte) || pte_none(*pte)) { + pte_t ptent = ptep_get(pte); + + if (!pte_present(ptent) || pte_none(ptent)) { pr_err("missing PTE:%lx\n", (addr - (unsigned long)data) >> PAGE_SHIFT); return -EINVAL; @@ -1692,7 +1694,9 @@ static int check_present_pte(pte_t *pte, unsigned long addr, void *data) static int check_absent_pte(pte_t *pte, unsigned long addr, void *data) { - if (pte_present(*pte) && !pte_none(*pte)) { + pte_t ptent = ptep_get(pte); + + if (pte_present(ptent) && !pte_none(ptent)) { pr_err("present PTE:%lx; expected to be revoked\n", (addr - (unsigned long)data) >> PAGE_SHIFT); return -EINVAL; diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 378cf02a2aa1..629edb6486de 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -228,7 +228,7 @@ static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr, goto err; #ifdef CONFIG_X86_64 if (unlikely(pmd_large(*pmdp))) - pte = *(pte_t *) pmdp; + pte = ptep_get((pte_t *)pmdp); else #endif pte = *pte_offset_kernel(pmdp, vaddr); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 306e6f1d1c70..ebe0ad31d0b0 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -514,6 +514,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, bool write_fault) { pte_t *ptep; + pte_t pte; spinlock_t *ptl; int ret; @@ -536,10 +537,12 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, return ret; } - if (write_fault && !pte_write(*ptep)) + pte = ptep_get(ptep); + + if (write_fault && !pte_write(pte)) ret = -EFAULT; else - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(pte); pte_unmap_unlock(ptep, ptl); return ret; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index e2f580e30a86..f447cd37cc4c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -949,7 +949,7 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) */ static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) { - return pte_none(*pte) ? 0 : -EBUSY; + return pte_none(ptep_get(pte)) ? 0 : -EBUSY; } static int privcmd_vma_range_is_mapped( diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0d63b6a0f0d8..507cd4e59d07 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -538,13 +538,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool migration = false, young = false, dirty = false; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - young = pte_young(*pte); - dirty = pte_dirty(*pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + young = pte_young(ptent); + dirty = pte_dirty(ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { int mapcount; @@ -732,11 +733,12 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; struct page *page = NULL; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); @@ -1105,7 +1107,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; @@ -1194,7 +1196,7 @@ out: return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); @@ -1550,7 +1552,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1893,10 +1895,11 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, return 0; } do { - struct page *page = can_gather_numa_stats(*pte, vma, addr); + pte_t ptent = ptep_get(pte); + struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ca83423f8d54..478e2b169c13 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -335,6 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; + pte_t ptent; bool ret = true; mmap_assert_locked(mm); @@ -374,9 +375,10 @@ again: * changes under us. PTE markers should be handled the same as none * ptes here. */ - if (pte_none_mostly(*pte)) + ptent = ptep_get(pte); + if (pte_none_mostly(ptent)) ret = true; - if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) ret = true; pte_unmap(pte); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 21f942025fec..beb7c63d2871 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1185,7 +1185,11 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_MMU + return ptep_get(ptep); +#else return *ptep; +#endif } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 0e1d239a882c..08c2bcefcb2b 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -555,7 +555,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ - WARN_ON_ONCE(!pte_none(*pte)); + WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index fc06f6419661..5063b482e34f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -231,7 +231,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; @@ -318,7 +318,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, address, pte); return pte; @@ -519,7 +519,7 @@ extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t old_pte = *ptep; + pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 607d742caa61..f0ac5b874919 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush_notify(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index d4ab81229136..e940802a15a4 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -39,7 +39,7 @@ struct folio *damon_get_folio(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = damon_get_folio(pte_pfn(*pte)); + struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte))); if (!folio) return; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5b3a3463d078..40801e38fcf0 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -89,7 +89,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - *accessed = pte_young(*pvmw.pte) || + *accessed = pte_young(ptep_get(pvmw.pte)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index e814f66dfc2e..2fcc9731528a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -323,7 +323,7 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } - if (!pte_present(*pte)) + if (!pte_present(ptep_get(pte))) goto out; damon_ptep_mkold(pte, walk->vma, addr); out: @@ -433,6 +433,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; + pte_t ptent; spinlock_t *ptl; struct folio *folio; struct damon_young_walk_private *priv = walk->private; @@ -471,12 +472,13 @@ regular_page: walk->action = ACTION_AGAIN; return 0; } - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) goto out; - folio = damon_get_folio(pte_pfn(*pte)); + folio = damon_get_folio(pte_pfn(ptent)); if (!folio) goto out; - if (pte_young(*pte) || !folio_test_idle(folio) || + if (pte_young(ptent) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = folio_size(folio); diff --git a/mm/filemap.c b/mm/filemap.c index 1893048ec9ff..00933089b8b6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3523,7 +3523,7 @@ again: * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(*vmf->pte)) + if (!pte_none(ptep_get(vmf->pte))) goto unlock; /* We're about to handle the fault */ diff --git a/mm/gup.c b/mm/gup.c index 838db6c0bfc2..38986e522d34 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -477,13 +477,14 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { - pte_t entry = *pte; + pte_t orig_entry = ptep_get(pte); + pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); - if (!pte_same(*pte, entry)) { + if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } @@ -549,7 +550,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); - pte = *ptep; + pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) @@ -821,6 +822,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t entry; int ret = -EFAULT; /* user gate pages are read-only */ @@ -844,16 +846,17 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; - if (pte_none(*pte)) + entry = ptep_get(pte); + if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; - *page = vm_normal_page(*vma, address, *pte); + *page = vm_normal_page(*vma, address, entry); if (!*page) { - if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; - *page = pte_page(*pte); + *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) @@ -2496,7 +2499,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || - unlikely(pte_val(pte) != pte_val(*ptep))) { + unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2693,7 +2696,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (!folio) return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { + if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); return 0; } diff --git a/mm/highmem.c b/mm/highmem.c index db251e77f98f..e19269093a93 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -161,7 +161,7 @@ struct page *__kmap_to_page(void *vaddr) /* kmap() mappings */ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP))) - return pte_page(pkmap_page_table[PKMAP_NR(addr)]); + return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)])); /* kmap_local_page() mappings */ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && @@ -191,6 +191,7 @@ static void flush_all_zero_pkmaps(void) for (i = 0; i < LAST_PKMAP; i++) { struct page *page; + pte_t ptent; /* * zero means we don't have anything to do, @@ -203,7 +204,8 @@ static void flush_all_zero_pkmaps(void) pkmap_count[i] = 0; /* sanity check */ - BUG_ON(pte_none(pkmap_page_table[i])); + ptent = ptep_get(&pkmap_page_table[i]); + BUG_ON(pte_none(ptent)); /* * Don't need an atomic fetch-and-clear op here; @@ -212,7 +214,7 @@ static void flush_all_zero_pkmaps(void) * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. */ - page = pte_page(pkmap_page_table[i]); + page = pte_page(ptent); pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); @@ -511,7 +513,7 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) { #ifdef ARCH_NEEDS_KMAP_HIGH_GET if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { - kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)]))); return true; } #endif @@ -548,7 +550,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); kmap_pte = kmap_get_pte(vaddr, idx); - BUG_ON(!pte_none(*kmap_pte)); + BUG_ON(!pte_none(ptep_get(kmap_pte))); pteval = pfn_pte(pfn, prot); arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); arch_kmap_local_post_map(vaddr, pteval); diff --git a/mm/hmm.c b/mm/hmm.c index b1a9159d7c92..855e25e59d8f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -228,7 +228,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; unsigned long cpu_flags; - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; if (pte_none_mostly(pte)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76f970aa5b4d..e94fe292f30a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2063,7 +2063,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); - VM_BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } @@ -2257,7 +2257,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mkuffd_wp(entry); page_add_anon_rmap(page + i, vma, addr, false); } - VM_BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d3d8a61b336..d76574425da3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7246,7 +7246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte = (pte_t *)pmd_alloc(mm, pud, addr); } } - BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); + BUG_ON(pte && pte_present(ptep_get(pte)) && !pte_huge(ptep_get(pte))); return pte; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f42079b73f82..c2007ef5e9b0 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -105,7 +105,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, * remapping (which is calling @walk->remap_pte). */ if (!walk->reuse_page) { - walk->reuse_page = pte_page(*pte); + walk->reuse_page = pte_page(ptep_get(pte)); /* * Because the reuse address is part of the range that we are * walking, skip the reuse address range. @@ -239,7 +239,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; - struct page *page = pte_page(*pte); + struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ @@ -286,7 +286,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct page *page; void *to; - BUG_ON(pte_page(*pte) != walk->reuse_page); + BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); diff --git a/mm/kasan/init.c b/mm/kasan/init.c index cc64ed6858c6..dcfec277e839 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -286,7 +286,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) for (i = 0; i < PTRS_PER_PTE; i++) { pte = pte_start + i; - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return; } @@ -343,16 +343,19 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, unsigned long end) { unsigned long next; + pte_t ptent; for (; addr < end; addr = next, pte++) { next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end) next = end; - if (!pte_present(*pte)) + ptent = ptep_get(pte); + + if (!pte_present(ptent)) continue; - if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) + if (WARN_ON(!kasan_early_shadow_page_entry(ptent))) continue; pte_clear(&init_mm, addr, pte); } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 3e62728ae25d..dd772f9d0f08 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -226,7 +226,7 @@ static bool shadow_mapped(unsigned long addr) if (pmd_bad(*pmd)) return true; pte = pte_offset_kernel(pmd, addr); - return !pte_none(*pte); + return !pte_none(ptep_get(pte)); } static int __meminit kasan_mem_notifier(struct notifier_block *nb, @@ -317,7 +317,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, unsigned long page; pte_t pte; - if (likely(!pte_none(*ptep))) + if (likely(!pte_none(ptep_get(ptep)))) return 0; page = __get_free_page(GFP_KERNEL); @@ -328,7 +328,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); - if (likely(pte_none(*ptep))) { + if (likely(pte_none(ptep_get(ptep)))) { set_pte_at(&init_mm, addr, ptep, pte); page = 0; } @@ -418,11 +418,11 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, { unsigned long page; - page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT); spin_lock(&init_mm.page_table_lock); - if (likely(!pte_none(*ptep))) { + if (likely(!pte_none(ptep_get(ptep)))) { pte_clear(&init_mm, addr, ptep); free_page(page); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 881669e738c0..0b4f00712895 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -511,7 +511,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, struct folio *folio, *tmp; while (--_pte >= pte) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) @@ -555,7 +555,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { ++none_or_zero; @@ -699,7 +699,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (is_zero_pfn(pte_pfn(pteval))) { @@ -797,7 +797,7 @@ static int __collapse_huge_page_copy(pte_t *pte, */ for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; _pte++, page++, _address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, _address); continue; @@ -1274,7 +1274,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; if (!cc->is_khugepaged || @@ -1650,18 +1650,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); /* empty pte, skip */ - if (pte_none(*pte)) + if (pte_none(ptent)) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) { + if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* @@ -1677,10 +1678,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); - if (pte_none(*pte)) + if (pte_none(ptent)) continue; - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) goto abort; page_remove_rmap(page, vma, false); diff --git a/mm/ksm.c b/mm/ksm.c index 3dc15459dd20..d995779dc1fe 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -429,15 +429,17 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex struct page *page = NULL; spinlock_t *ptl; pte_t *pte; + pte_t ptent; int ret; pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) return 0; - if (pte_present(*pte)) { - page = vm_normal_page(walk->vma, addr, *pte); - } else if (!pte_none(*pte)) { - swp_entry_t entry = pte_to_swp_entry(*pte); + ptent = ptep_get(pte); + if (pte_present(ptent)) { + page = vm_normal_page(walk->vma, addr, ptent); + } else if (!pte_none(ptent)) { + swp_entry_t entry = pte_to_swp_entry(ptent); /* * As KSM pages remain KSM pages until freed, no need to wait @@ -1085,6 +1087,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; + pte_t entry; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1102,10 +1105,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; anon_exclusive = PageAnonExclusive(page); - if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + entry = ptep_get(pvmw.pte); + if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { - pte_t entry; - swapped = PageSwapCache(page); flush_cache_page(vma, pvmw.address, page_to_pfn(page)); /* @@ -1147,7 +1149,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } - *orig_pte = *pvmw.pte; + *orig_pte = entry; err = 0; out_unlock: @@ -1204,7 +1206,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; - if (!pte_same(*ptep, orig_pte)) { + if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } @@ -1231,7 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, dec_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. diff --git a/mm/madvise.c b/mm/madvise.c index 9b3c9610052f..886f06066622 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -207,7 +207,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, break; } - pte = *ptep; + pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); @@ -438,7 +438,7 @@ regular_folio: flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -642,7 +642,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 87b4beeda4fa..a26dd8bcfcdb 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -35,7 +35,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct wp_walk *wpwalk = walk->private; - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_write(ptent)) { pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); @@ -91,7 +91,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, { struct wp_walk *wpwalk = walk->private; struct clean_walk *cwalk = to_clean_walk(wpwalk); - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_dirty(ptent)) { pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 77d8d2d14fcf..93056918e956 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6025,7 +6025,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, if (!pte) return 0; for (; addr != end; pte++, addr += PAGE_SIZE) - if (get_mctgt_type(vma, addr, *pte, NULL)) + if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) mc.precharge++; /* increment precharge temporarily */ pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -6246,7 +6246,7 @@ retry: if (!pte) return 0; for (; addr != end; addr += PAGE_SIZE) { - pte_t ptent = *(pte++); + pte_t ptent = ptep_get(pte++); bool device = false; swp_entry_t ent; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d5116f0eb1b6..e245191e6b04 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -6,16 +6,16 @@ * High level machine check handler. Handles pages reported by the * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. - * + * * In addition there is a "soft offline" entry point that allows stop using * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronously in respect to - * other VM users, because memory failures could happen anytime and - * anywhere. This could violate some of their assumptions. This is why - * this code has to be extremely careful. Generally it tries to use - * normal locking rules, as in get the standard locks, even if that means + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. * * It can be very tempting to add handling for obscure cases here. @@ -25,12 +25,12 @@ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in * tools/mm/page-types when running a real workload. - * + * * There are several operations here with exponential complexity because - * of unsuitable VM data structures. For example the operation to map back - * from RMAP chains to processes has to walk the complete process list and + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and * has non linear complexity with the number. But since memory corruptions - * are rare we hope to get away with this. This avoids impacting the core + * are rare we hope to get away with this. This avoids impacting the core * VM. */ @@ -386,6 +386,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t ptent; VM_BUG_ON_VMA(address == -EFAULT, vma); pgd = pgd_offset(vma->vm_mm, address); @@ -407,7 +408,8 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pte = pte_offset_map(pmd, address); if (!pte) return 0; - if (pte_present(*pte) && pte_devmap(*pte)) + ptent = ptep_get(pte); + if (pte_present(ptent) && pte_devmap(ptent)) ret = PAGE_SHIFT; pte_unmap(pte); return ret; @@ -799,7 +801,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, goto out; for (; addr != end; ptep++, addr += PAGE_SIZE) { - ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); if (ret == 1) break; diff --git a/mm/memory.c b/mm/memory.c index 63c30f58142b..3d78b552866d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -699,15 +699,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, struct page *page, unsigned long address, pte_t *ptep) { + pte_t orig_pte; pte_t pte; swp_entry_t entry; + orig_pte = ptep_get(ptep); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (pte_swp_soft_dirty(*ptep)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*ptep); - if (pte_swp_uffd_wp(*ptep)) + entry = pte_to_swp_entry(orig_pte); + if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -744,7 +746,7 @@ static int try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { - swp_entry_t entry = pte_to_swp_entry(*src_pte); + swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); struct page *page = pfn_swap_entry_to_page(entry); if (trylock_page(page)) { @@ -768,9 +770,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; - pte_t pte = *src_pte; + pte_t orig_pte = ptep_get(src_pte); + pte_t pte = orig_pte; struct page *page; - swp_entry_t entry = pte_to_swp_entry(pte); + swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) @@ -785,8 +788,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ - if (pte_swp_exclusive(*src_pte)) { - pte = pte_swp_clear_exclusive(*src_pte); + if (pte_swp_exclusive(orig_pte)) { + pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; @@ -805,9 +808,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -840,7 +843,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -904,7 +907,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); - if (userfaultfd_pte_wp(dst_vma, *src_pte)) + if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); @@ -922,7 +925,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; - pte_t pte = *src_pte; + pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1002,6 +1005,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, ret = 0; int rss[NR_MM_COUNTERS]; @@ -1047,17 +1051,18 @@ again: spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } - if (pte_none(*src_pte)) { + ptent = ptep_get(src_pte); + if (pte_none(ptent)) { progress++; continue; } - if (unlikely(!pte_present(*src_pte))) { + if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(*src_pte); + entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1407,7 +1412,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); struct page *page; if (pte_none(ptent)) @@ -1822,7 +1827,7 @@ static int validate_page_before_insert(struct page *page) static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); @@ -2116,7 +2121,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; - if (!pte_none(*pte)) { + entry = ptep_get(pte); + if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed @@ -2128,11 +2134,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); + if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } - entry = pte_mkyoung(*pte); + entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); @@ -2344,7 +2350,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; arch_enter_lazy_mmu_mode(); do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; @@ -2585,7 +2591,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { - if (create || !pte_none(*pte)) { + if (create || !pte_none(ptep_get(pte))) { err = fn(pte++, addr, data); if (err) break; @@ -2787,7 +2793,7 @@ static inline int pte_unmap_same(struct vm_fault *vmf) #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spin_lock(vmf->ptl); - same = pte_same(*vmf->pte, vmf->orig_pte); + same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); spin_unlock(vmf->ptl); } #endif @@ -2838,7 +2844,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only @@ -2866,7 +2872,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); @@ -3114,7 +3120,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) { + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(&old_folio->page)); @@ -3241,7 +3247,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) * We might have raced with another page fault while we released the * pte_offset_map_lock. */ - if (!pte_same(*vmf->pte, vmf->orig_pte)) { + if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; @@ -3336,7 +3342,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) struct folio *folio = NULL; if (likely(!unshare)) { - if (userfaultfd_pte_wp(vma, *vmf->pte)) { + if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } @@ -3598,7 +3604,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); if (vmf->pte) @@ -3643,7 +3649,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. * So is_pte_marker() check is not enough to safely drop the pte. */ - if (pte_same(vmf->orig_pte, *vmf->pte)) + if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3739,7 +3745,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || - !pte_same(*vmf->pte, vmf->orig_pte))) + !pte_same(ptep_get(vmf->pte), + vmf->orig_pte))) goto unlock; /* @@ -3816,7 +3823,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && + pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } @@ -3886,7 +3894,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { @@ -4331,9 +4339,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) - return !pte_same(*vmf->pte, vmf->orig_pte); + return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); - return !pte_none(*vmf->pte); + return !pte_none(ptep_get(vmf->pte)); } /** @@ -4643,7 +4651,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) * we don't have concurrent modification by hardware * followed by an update. */ - if (unlikely(pte_none(*vmf->pte))) + if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; @@ -4699,7 +4707,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * the pfn may be screwed if the read is non atomic. */ spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4772,7 +4780,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) goto out; - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4930,7 +4938,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) spin_lock(vmf->ptl); entry = vmf->orig_pte; - if (unlikely(!pte_same(*vmf->pte, entry))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } @@ -5416,7 +5424,7 @@ int follow_pte(struct mm_struct *mm, unsigned long address, ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!ptep) goto out; - if (!pte_present(*ptep)) + if (!pte_present(ptep_get(ptep))) goto unlock; *ptepp = ptep; return 0; @@ -5453,7 +5461,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(ptep_get(ptep)); pte_unmap_unlock(ptep, ptl); return 0; } @@ -5473,7 +5481,7 @@ int follow_phys(struct vm_area_struct *vma, if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - pte = *ptep; + pte = ptep_get(ptep); if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -5517,7 +5525,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) return -EINVAL; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); prot = pgprot_val(pte_pgprot(pte)); @@ -5533,7 +5541,7 @@ retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) goto out_unmap; - if (!pte_same(pte, *ptep)) { + if (!pte_same(pte, ptep_get(ptep))) { pte_unmap_unlock(ptep, ptl); iounmap(maddr); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0241bb64978b..edc25195f5bd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -508,6 +508,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long flags = qp->flags; bool has_unmovable = false; pte_t *pte, *mapped_pte; + pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); @@ -520,9 +521,10 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* diff --git a/mm/migrate.c b/mm/migrate.c index 363562992046..ce35afdbc1e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -188,6 +188,7 @@ static bool remove_migration_pte(struct folio *folio, while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; + pte_t old_pte; pte_t pte; swp_entry_t entry; struct page *new; @@ -210,17 +211,18 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pvmw.pte)) + old_pte = ptep_get(pvmw.pte); + if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*pvmw.pte); + entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte); - else if (pte_swp_uffd_wp(*pvmw.pte)) + else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) @@ -234,9 +236,9 @@ static bool remove_migration_pte(struct folio *folio, entry = make_readable_device_private_entry( page_to_pfn(new)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*pvmw.pte)) + if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*pvmw.pte)) + if (pte_swp_uffd_wp(old_pte)) pte = pte_swp_mkuffd_wp(pte); } @@ -308,7 +310,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, if (!ptep) return; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap(ptep); if (!is_swap_pte(pte)) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a14af6b12b04..02d272b909b5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -111,7 +111,7 @@ again: swp_entry_t entry; pte_t pte; - pte = *ptep; + pte = ptep_get(ptep); if (pte_none(pte)) { if (vma_is_anonymous(vma)) { @@ -194,7 +194,7 @@ again: bool anon_exclusive; pte_t swp_pte; - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(pte)); anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) { pte = ptep_clear_flush(vma, addr, ptep); @@ -573,6 +573,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pud_t *pudp; pmd_t *pmdp; pte_t *ptep; + pte_t orig_pte; /* Only allow populating anonymous memory */ if (!vma_is_anonymous(vma)) @@ -628,16 +629,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); if (!ptep) goto abort; + orig_pte = ptep_get(ptep); + if (check_stable_address_space(mm)) goto unlock_abort; - if (pte_present(*ptep)) { - unsigned long pfn = pte_pfn(*ptep); + if (pte_present(orig_pte)) { + unsigned long pfn = pte_pfn(orig_pte); if (!is_zero_pfn(pfn)) goto unlock_abort; flush = true; - } else if (!pte_none(*ptep)) + } else if (!pte_none(orig_pte)) goto unlock_abort; /* @@ -654,7 +657,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, get_page(page); if (flush) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, entry); update_mmu_cache(vma, addr, ptep); diff --git a/mm/mincore.c b/mm/mincore.c index f33f6a0b1ded..b7f7a516b26c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -119,7 +119,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } for (; addr != end; ptep++, addr += PAGE_SIZE) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 9f2b1173b1b1..d7db94519884 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -312,6 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; + pte_t ptent; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); @@ -334,9 +335,10 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio)) diff --git a/mm/mprotect.c b/mm/mprotect.c index 64e1df0af514..327a6eb90afb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static long change_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { - oldpte = *pte; + oldpte = ptep_get(pte); if (pte_present(oldpte)) { pte_t ptent; @@ -544,7 +544,8 @@ long change_protection(struct mmu_gather *tlb, static int prot_none_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } @@ -552,7 +553,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } diff --git a/mm/mremap.c b/mm/mremap.c index bfc3d1902a94..8ec184ac90ff 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -188,7 +188,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { - if (pte_none(*old_pte)) + if (pte_none(ptep_get(old_pte))) continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 0c511330dbc9..8f89f9c8f0df 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -190,7 +190,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, @@ -243,7 +243,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 2af734274073..49e0d28f0379 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -15,6 +15,8 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) { + pte_t ptent; + if (pvmw->flags & PVMW_SYNC) { /* Use the stricter lookup */ pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd, @@ -35,10 +37,12 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) if (!pvmw->pte) return false; + ptent = ptep_get(pvmw->pte); + if (pvmw->flags & PVMW_MIGRATION) { - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* * Handle un-addressable ZONE_DEVICE memory. @@ -56,11 +60,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) * For more details on device private memory see HMM * (include/linux/hmm.h or mm/hmm.c). */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; - } else if (!pte_present(*pvmw->pte)) { + } else if (!pte_present(ptent)) { return false; } pvmw->ptl = *ptlp; @@ -90,33 +94,34 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) static bool check_pte(struct page_vma_mapped_walk *pvmw) { unsigned long pfn; + pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { swp_entry_t entry; - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_migration_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); } else { - if (!pte_present(*pvmw->pte)) + if (!pte_present(ptent)) return false; - pfn = pte_pfn(*pvmw->pte); + pfn = pte_pfn(ptent); } return (pfn - pvmw->pfn) < pvmw->nr_pages; @@ -294,7 +299,7 @@ next_pte: goto restart; } pvmw->pte++; - } while (pte_none(*pvmw->pte)); + } while (pte_none(ptep_get(pvmw->pte))); if (!pvmw->ptl) { pvmw->ptl = ptl; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c7ab18a5fb77..4d454953046f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -68,7 +68,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - int changed = !pte_same(*ptep, entry); + int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); diff --git a/mm/rmap.c b/mm/rmap.c index cd918cb9a431..0c0d8857dfce 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -826,7 +826,8 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { - if (lru_gen_enabled() && pte_young(*pvmw.pte)) { + if (lru_gen_enabled() && + pte_young(ptep_get(pvmw.pte))) { lru_gen_look_around(&pvmw); referenced++; } @@ -956,13 +957,13 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) address = pvmw->address; if (pvmw->pte) { - pte_t entry; pte_t *pte = pvmw->pte; + pte_t entry = ptep_get(pte); - if (!pte_dirty(*pte) && !pte_write(*pte)) + if (!pte_dirty(entry) && !pte_write(entry)) continue; - flush_cache_page(vma, address, pte_pfn(*pte)); + flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); @@ -1137,7 +1138,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * @folio: Folio which contains page. * @page: Page to add to rmap. * @vma: VM area to add page to. - * @address: User virtual address of the mapping + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct folio *folio, struct page *page, @@ -1458,6 +1459,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1508,8 +1510,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pfn = pte_pfn(ptep_get(pvmw.pte)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -1571,7 +1573,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -1818,6 +1820,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1877,6 +1880,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); + pfn = pte_pfn(ptep_get(pvmw.pte)); + if (folio_is_zone_device(folio)) { /* * Our PTE is a non-present device exclusive entry and @@ -1891,8 +1896,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); subpage = &folio->page; } else { - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && @@ -1952,7 +1956,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -2187,6 +2191,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, struct mmu_notifier_range range; swp_entry_t entry; pte_t swp_pte; + pte_t ptent; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, @@ -2198,18 +2203,19 @@ static bool page_make_device_exclusive_one(struct folio *folio, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - if (!pte_present(*pvmw.pte)) { + ptent = ptep_get(pvmw.pte); + if (!pte_present(ptent)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pte_pfn(ptent) - folio_pfn(folio)); address = pvmw.address; /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pte_pfn(ptent)); pteval = ptep_clear_flush(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 10d73a0dfcec..a044a130405b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -133,7 +133,7 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size, void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { - unsigned long pfn = pte_pfn(*pte); + unsigned long pfn = pte_pfn(ptep_get(pte)); int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) @@ -146,7 +146,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct page *reuse) { pte_t *pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) { + if (pte_none(ptep_get(pte))) { pte_t entry; void *p; @@ -414,7 +414,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, * with just tail struct pages. */ return vmemmap_populate_range(start, end, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); } size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); @@ -438,7 +438,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); if (rc) return -ENOMEM; } diff --git a/mm/swap_state.c b/mm/swap_state.c index a33c60e0158f..4a5c7b748051 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -275,9 +275,9 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, } } -/* - * If we are the only user, then try to free up the swap cache. - * +/* + * If we are the only user, then try to free up the swap cache. + * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. @@ -294,7 +294,7 @@ void free_swap_cache(struct page *page) } } -/* +/* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 74dd4d2337b7..a6945c2e0d03 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1745,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; - pte_t *pte, new_pte; + pte_t *pte, new_pte, old_pte; bool hwposioned = false; int ret = 1; @@ -1757,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte || !pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), + swp_entry_to_pte(entry)))) { ret = 0; goto out; } + old_pte = ptep_get(pte); + if (unlikely(hwposioned || !PageUptodate(page))) { swp_entry_t swp_entry; @@ -1793,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * call and have the page locked. */ VM_BUG_ON_PAGE(PageWriteback(page), page); - if (pte_swp_exclusive(*pte)) + if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(page, vma, addr, rmap_flags); @@ -1802,9 +1805,9 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pte)) + if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); - if (pte_swp_uffd_wp(*pte)) + if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); @@ -1833,6 +1836,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned char swp_count; swp_entry_t entry; int ret; + pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); @@ -1840,10 +1844,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, break; } - if (!is_swap_pte(*pte)) + ptent = ptep_get_lockless(pte); + + if (!is_swap_pte(ptent)) continue; - entry = pte_to_swp_entry(*pte); + entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5fd787158c70..a2bf37ee276d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -97,7 +97,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ - if (!pte_none_mostly(*dst_pte)) + if (!pte_none_mostly(ptep_get(dst_pte))) goto out_unlock; folio = page_folio(page); @@ -230,7 +230,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, goto out_unlock; } ret = -EEXIST; - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7382e0a60ce1..5a3bf408251b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -103,7 +103,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); @@ -472,7 +472,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, do { struct page *page = pages[*nr]; - if (WARN_ON(!pte_none(*pte))) + if (WARN_ON(!pte_none(ptep_get(pte)))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; @@ -704,7 +704,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) return NULL; ptep = pte_offset_kernel(pmd, addr); - pte = *ptep; + pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3f64c8d9f629..e305c11ec8fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4037,15 +4037,16 @@ restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; struct folio *folio; + pte_t ptent = ptep_get(pte + i); total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(pte[i], args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) { + if (!pte_young(ptent)) { walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -4060,7 +4061,7 @@ restart: young++; walk->mm_stats[MM_LEAF_YOUNG]++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); @@ -4703,12 +4704,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; + pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + pfn = get_pte_pfn(ptent, pvmw->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) + if (!pte_young(ptent)) continue; folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); @@ -4720,7 +4722,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) young++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 51e4882d0873..fb37adecfc91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2578,6 +2578,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, { kvm_pfn_t pfn; pte_t *ptep; + pte_t pte; spinlock_t *ptl; int r; @@ -2601,14 +2602,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, return r; } - if (write_fault && !pte_write(*ptep)) { + pte = ptep_get(ptep); + + if (write_fault && !pte_write(pte)) { pfn = KVM_PFN_ERR_RO_FAULT; goto out; } if (writable) - *writable = pte_write(*ptep); - pfn = pte_pfn(*ptep); + *writable = pte_write(pte); + pfn = pte_pfn(pte); /* * Get a reference here because callers of *hva_to_pfn* and @@ -2626,7 +2629,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * tail pages of non-compound higher order allocations, which * would then underflow the refcount when the caller does the * required put_page. Don't allow those pages here. - */ + */ if (!kvm_try_get_pfn(pfn)) r = -EFAULT; -- cgit v1.2.3 From cb0551adf92dac140d0e0958b4aaa860348bedbb Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 14 Jun 2023 23:13:12 +0900 Subject: zram: further limit recompression threshold Recompression threshold should be below huge-size-class watermark. Any object larger than huge-size-class is a "huge object" and occupies a whole physical page on the zsmalloc side, in other words it's incompressible, as far as zsmalloc is concerned. Link: https://lkml.kernel.org/r/20230614141338.3480029-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Brian Geffon Acked-by: Brian Geffon Cc: Jens Axboe Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f6d90f1ba5cf..5d258a28ec43 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1753,7 +1753,7 @@ static ssize_t recompress_store(struct device *dev, } } - if (threshold >= PAGE_SIZE) + if (threshold >= huge_class_size) return -EINVAL; down_read(&zram->init_lock); -- cgit v1.2.3 From 0b62af28f249b9c4036a05acfb053058dc02e2e2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 2023 17:45:48 +0100 Subject: i915: convert shmem_sg_free_table() to use a folio_batch Remove a few hidden compound_head() calls by converting the returned page to a folio once and using the folio APIs. We also only increment the refcount on the folio once instead of once for each page. Ideally, we would have a for_each_sgt_folio macro, but until then this will do. Link: https://lkml.kernel.org/r/20230621164557.3510324-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 55 +++++++++++++++++-------------- 1 file changed, 31 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 37d1efcd3ca6..adf1154c0e10 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -19,13 +19,13 @@ #include "i915_trace.h" /* - * Move pages to appropriate lru and release the pagevec, decrementing the - * ref count of those pages. + * Move folios to appropriate lru and release the batch, decrementing the + * ref count of those folios. */ -static void check_release_pagevec(struct pagevec *pvec) +static void check_release_folio_batch(struct folio_batch *fbatch) { - check_move_unevictable_pages(pvec); - __pagevec_release(pvec); + check_move_unevictable_folios(fbatch); + __folio_batch_release(fbatch); cond_resched(); } @@ -33,24 +33,29 @@ void shmem_sg_free_table(struct sg_table *st, struct address_space *mapping, bool dirty, bool backup) { struct sgt_iter sgt_iter; - struct pagevec pvec; + struct folio_batch fbatch; + struct folio *last = NULL; struct page *page; mapping_clear_unevictable(mapping); - pagevec_init(&pvec); + folio_batch_init(&fbatch); for_each_sgt_page(page, sgt_iter, st) { - if (dirty) - set_page_dirty(page); + struct folio *folio = page_folio(page); + if (folio == last) + continue; + last = folio; + if (dirty) + folio_mark_dirty(folio); if (backup) - mark_page_accessed(page); + folio_mark_accessed(folio); - if (!pagevec_add(&pvec, page)) - check_release_pagevec(&pvec); + if (!folio_batch_add(&fbatch, folio)) + check_release_folio_batch(&fbatch); } - if (pagevec_count(&pvec)) - check_release_pagevec(&pvec); + if (fbatch.nr) + check_release_folio_batch(&fbatch); sg_free_table(st); } @@ -63,8 +68,7 @@ int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, unsigned int page_count; /* restricted by sg_alloc_table */ unsigned long i; struct scatterlist *sg; - struct page *page; - unsigned long last_pfn = 0; /* suppress gcc warning */ + unsigned long next_pfn = 0; /* suppress gcc warning */ gfp_t noreclaim; int ret; @@ -95,6 +99,7 @@ int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, sg = st->sgl; st->nents = 0; for (i = 0; i < page_count; i++) { + struct folio *folio; const unsigned int shrink[] = { I915_SHRINK_BOUND | I915_SHRINK_UNBOUND, 0, @@ -103,12 +108,12 @@ int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, do { cond_resched(); - page = shmem_read_mapping_page_gfp(mapping, i, gfp); - if (!IS_ERR(page)) + folio = shmem_read_folio_gfp(mapping, i, gfp); + if (!IS_ERR(folio)) break; if (!*s) { - ret = PTR_ERR(page); + ret = PTR_ERR(folio); goto err_sg; } @@ -147,19 +152,21 @@ int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, if (!i || sg->length >= max_segment || - page_to_pfn(page) != last_pfn + 1) { + folio_pfn(folio) != next_pfn) { if (i) sg = sg_next(sg); st->nents++; - sg_set_page(sg, page, PAGE_SIZE, 0); + sg_set_folio(sg, folio, folio_size(folio), 0); } else { - sg->length += PAGE_SIZE; + /* XXX: could overflow? */ + sg->length += folio_size(folio); } - last_pfn = page_to_pfn(page); + next_pfn = folio_pfn(folio) + folio_nr_pages(folio); + i += folio_nr_pages(folio) - 1; /* Check that the i965g/gm workaround works. */ - GEM_BUG_ON(gfp & __GFP_DMA32 && last_pfn >= 0x00100000UL); + GEM_BUG_ON(gfp & __GFP_DMA32 && next_pfn >= 0x00100000UL); } if (sg) /* loop terminated early; short sg table */ sg_mark_end(sg); -- cgit v1.2.3 From 3291e09a463870610b8227f32b16b19a587edf33 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 2023 17:45:49 +0100 Subject: drm: convert drm_gem_put_pages() to use a folio_batch Remove a few hidden compound_head() calls by converting the returned page to a folio once and using the folio APIs. Link: https://lkml.kernel.org/r/20230621164557.3510324-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_gem.c | 68 +++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 29 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 1a5a2cd0d4ec..78dcae201cc6 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -496,13 +496,13 @@ int drm_gem_create_mmap_offset(struct drm_gem_object *obj) EXPORT_SYMBOL(drm_gem_create_mmap_offset); /* - * Move pages to appropriate lru and release the pagevec, decrementing the - * ref count of those pages. + * Move folios to appropriate lru and release the folios, decrementing the + * ref count of those folios. */ -static void drm_gem_check_release_pagevec(struct pagevec *pvec) +static void drm_gem_check_release_batch(struct folio_batch *fbatch) { - check_move_unevictable_pages(pvec); - __pagevec_release(pvec); + check_move_unevictable_folios(fbatch); + __folio_batch_release(fbatch); cond_resched(); } @@ -534,10 +534,10 @@ static void drm_gem_check_release_pagevec(struct pagevec *pvec) struct page **drm_gem_get_pages(struct drm_gem_object *obj) { struct address_space *mapping; - struct page *p, **pages; - struct pagevec pvec; - int i, npages; - + struct page **pages; + struct folio *folio; + struct folio_batch fbatch; + int i, j, npages; if (WARN_ON(!obj->filp)) return ERR_PTR(-EINVAL); @@ -559,11 +559,14 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) mapping_set_unevictable(mapping); - for (i = 0; i < npages; i++) { - p = shmem_read_mapping_page(mapping, i); - if (IS_ERR(p)) + i = 0; + while (i < npages) { + folio = shmem_read_folio_gfp(mapping, i, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto fail; - pages[i] = p; + for (j = 0; j < folio_nr_pages(folio); j++, i++) + pages[i] = folio_file_page(folio, i); /* Make sure shmem keeps __GFP_DMA32 allocated pages in the * correct region during swapin. Note that this requires @@ -571,23 +574,26 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) * so shmem can relocate pages during swapin if required. */ BUG_ON(mapping_gfp_constraint(mapping, __GFP_DMA32) && - (page_to_pfn(p) >= 0x00100000UL)); + (folio_pfn(folio) >= 0x00100000UL)); } return pages; fail: mapping_clear_unevictable(mapping); - pagevec_init(&pvec); - while (i--) { - if (!pagevec_add(&pvec, pages[i])) - drm_gem_check_release_pagevec(&pvec); + folio_batch_init(&fbatch); + j = 0; + while (j < i) { + struct folio *f = page_folio(pages[j]); + if (!folio_batch_add(&fbatch, f)) + drm_gem_check_release_batch(&fbatch); + j += folio_nr_pages(f); } - if (pagevec_count(&pvec)) - drm_gem_check_release_pagevec(&pvec); + if (fbatch.nr) + drm_gem_check_release_batch(&fbatch); kvfree(pages); - return ERR_CAST(p); + return ERR_CAST(folio); } EXPORT_SYMBOL(drm_gem_get_pages); @@ -603,7 +609,7 @@ void drm_gem_put_pages(struct drm_gem_object *obj, struct page **pages, { int i, npages; struct address_space *mapping; - struct pagevec pvec; + struct folio_batch fbatch; mapping = file_inode(obj->filp)->i_mapping; mapping_clear_unevictable(mapping); @@ -616,23 +622,27 @@ void drm_gem_put_pages(struct drm_gem_object *obj, struct page **pages, npages = obj->size >> PAGE_SHIFT; - pagevec_init(&pvec); + folio_batch_init(&fbatch); for (i = 0; i < npages; i++) { + struct folio *folio; + if (!pages[i]) continue; + folio = page_folio(pages[i]); if (dirty) - set_page_dirty(pages[i]); + folio_mark_dirty(folio); if (accessed) - mark_page_accessed(pages[i]); + folio_mark_accessed(folio); /* Undo the reference we took when populating the table */ - if (!pagevec_add(&pvec, pages[i])) - drm_gem_check_release_pagevec(&pvec); + if (!folio_batch_add(&fbatch, folio)) + drm_gem_check_release_batch(&fbatch); + i += folio_nr_pages(folio) - 1; } - if (pagevec_count(&pvec)) - drm_gem_check_release_pagevec(&pvec); + if (folio_batch_count(&fbatch)) + drm_gem_check_release_batch(&fbatch); kvfree(pages); } -- cgit v1.2.3 From f8a101ff09a70ec708b66b3f5bd4e7405283d14a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 2023 17:45:52 +0100 Subject: i915: convert i915_gpu_error to use a folio_batch Remove one of the last remaining users of pagevec. Link: https://lkml.kernel.org/r/20230621164557.3510324-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/i915_gpu_error.c | 50 +++++++++++++++++------------------ 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index f020c0086fbc..35f70bb8e4fb 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -187,64 +187,64 @@ i915_error_printer(struct drm_i915_error_state_buf *e) } /* single threaded page allocator with a reserved stash for emergencies */ -static void pool_fini(struct pagevec *pv) +static void pool_fini(struct folio_batch *fbatch) { - pagevec_release(pv); + folio_batch_release(fbatch); } -static int pool_refill(struct pagevec *pv, gfp_t gfp) +static int pool_refill(struct folio_batch *fbatch, gfp_t gfp) { - while (pagevec_space(pv)) { - struct page *p; + while (folio_batch_space(fbatch)) { + struct folio *folio; - p = alloc_page(gfp); - if (!p) + folio = folio_alloc(gfp, 0); + if (!folio) return -ENOMEM; - pagevec_add(pv, p); + folio_batch_add(fbatch, folio); } return 0; } -static int pool_init(struct pagevec *pv, gfp_t gfp) +static int pool_init(struct folio_batch *fbatch, gfp_t gfp) { int err; - pagevec_init(pv); + folio_batch_init(fbatch); - err = pool_refill(pv, gfp); + err = pool_refill(fbatch, gfp); if (err) - pool_fini(pv); + pool_fini(fbatch); return err; } -static void *pool_alloc(struct pagevec *pv, gfp_t gfp) +static void *pool_alloc(struct folio_batch *fbatch, gfp_t gfp) { - struct page *p; + struct folio *folio; - p = alloc_page(gfp); - if (!p && pagevec_count(pv)) - p = pv->pages[--pv->nr]; + folio = folio_alloc(gfp, 0); + if (!folio && folio_batch_count(fbatch)) + folio = fbatch->folios[--fbatch->nr]; - return p ? page_address(p) : NULL; + return folio ? folio_address(folio) : NULL; } -static void pool_free(struct pagevec *pv, void *addr) +static void pool_free(struct folio_batch *fbatch, void *addr) { - struct page *p = virt_to_page(addr); + struct folio *folio = virt_to_folio(addr); - if (pagevec_space(pv)) - pagevec_add(pv, p); + if (folio_batch_space(fbatch)) + folio_batch_add(fbatch, folio); else - __free_page(p); + folio_put(folio); } #ifdef CONFIG_DRM_I915_COMPRESS_ERROR struct i915_vma_compress { - struct pagevec pool; + struct folio_batch pool; struct z_stream_s zstream; void *tmp; }; @@ -381,7 +381,7 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m) #else struct i915_vma_compress { - struct pagevec pool; + struct folio_batch pool; }; static bool compress_init(struct i915_vma_compress *c) -- cgit v1.2.3