diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-14 06:29:45 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-14 06:29:45 +0300 |
commit | e2ca6ba6ba0152361aa4fcbf6067db71b2c7a770 (patch) | |
tree | f7ed7753a2e66486a4ffe0fbbf98404ec4ba2212 /fs/dax.c | |
parent | 7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91 (diff) | |
parent | c45bc55a99957b20e4e0333bcd42e12d1833a7f5 (diff) | |
download | linux-e2ca6ba6ba0152361aa4fcbf6067db71b2c7a770.tar.xz |
Merge tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
- More userfaultfs work from Peter Xu
- Several convert-to-folios series from Sidhartha Kumar and Huang Ying
- Some filemap cleanups from Vishal Moola
- David Hildenbrand added the ability to selftest anon memory COW
handling
- Some cpuset simplifications from Liu Shixin
- Addition of vmalloc tracing support by Uladzislau Rezki
- Some pagecache folioifications and simplifications from Matthew
Wilcox
- A pagemap cleanup from Kefeng Wang: we have VM_ACCESS_FLAGS, so use
it
- Miguel Ojeda contributed some cleanups for our use of the
__no_sanitize_thread__ gcc keyword.
This series should have been in the non-MM tree, my bad
- Naoya Horiguchi improved the interaction between memory poisoning and
memory section removal for huge pages
- DAMON cleanups and tuneups from SeongJae Park
- Tony Luck fixed the handling of COW faults against poisoned pages
- Peter Xu utilized the PTE marker code for handling swapin errors
- Hugh Dickins reworked compound page mapcount handling, simplifying it
and making it more efficient
- Removal of the autonuma savedwrite infrastructure from Nadav Amit and
David Hildenbrand
- zram support for multiple compression streams from Sergey Senozhatsky
- David Hildenbrand reworked the GUP code's R/O long-term pinning so
that drivers no longer need to use the FOLL_FORCE workaround which
didn't work very well anyway
- Mel Gorman altered the page allocator so that local IRQs can remnain
enabled during per-cpu page allocations
- Vishal Moola removed the try_to_release_page() wrapper
- Stefan Roesch added some per-BDI sysfs tunables which are used to
prevent network block devices from dirtying excessive amounts of
pagecache
- David Hildenbrand did some cleanup and repair work on KSM COW
breaking
- Nhat Pham and Johannes Weiner have implemented writeback in zswap's
zsmalloc backend
- Brian Foster has fixed a longstanding corner-case oddity in
file[map]_write_and_wait_range()
- sparse-vmemmap changes for MIPS, LoongArch and NIOS2 from Feiyang
Chen
- Shiyang Ruan has done some work on fsdax, to make its reflink mode
work better under xfstests. Better, but still not perfect
- Christoph Hellwig has removed the .writepage() method from several
filesystems. They only need .writepages()
- Yosry Ahmed wrote a series which fixes the memcg reclaim target
beancounting
- David Hildenbrand has fixed some of our MM selftests for 32-bit
machines
- Many singleton patches, as usual
* tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (313 commits)
mm/hugetlb: set head flag before setting compound_order in __prep_compound_gigantic_folio
mm: mmu_gather: allow more than one batch of delayed rmaps
mm: fix typo in struct pglist_data code comment
kmsan: fix memcpy tests
mm: add cond_resched() in swapin_walk_pmd_entry()
mm: do not show fs mm pc for VM_LOCKONFAULT pages
selftests/vm: ksm_functional_tests: fixes for 32bit
selftests/vm: cow: fix compile warning on 32bit
selftests/vm: madv_populate: fix missing MADV_POPULATE_(READ|WRITE) definitions
mm/gup_test: fix PIN_LONGTERM_TEST_READ with highmem
mm,thp,rmap: fix races between updates of subpages_mapcount
mm: memcg: fix swapcached stat accounting
mm: add nodes= arg to memory.reclaim
mm: disable top-tier fallback to reclaim on proactive reclaim
selftests: cgroup: make sure reclaim target memcg is unprotected
selftests: cgroup: refactor proactive reclaim code to reclaim_until()
mm: memcg: fix stale protection of reclaim target memcg
mm/mmap: properly unaccount memory on mas_preallocate() failure
omfs: remove ->writepage
jfs: remove ->writepage
...
Diffstat (limited to 'fs/dax.c')
-rw-r--r-- | fs/dax.c | 221 |
1 files changed, 151 insertions, 70 deletions
@@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) -static inline bool dax_mapping_is_cow(struct address_space *mapping) +static inline bool dax_page_is_shared(struct page *page) { - return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; + return page->mapping == PAGE_MAPPING_DAX_SHARED; } /* - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount. + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the + * refcount. */ -static inline void dax_mapping_set_cow(struct page *page) +static inline void dax_page_share_get(struct page *page) { - if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + if (page->mapping != PAGE_MAPPING_DAX_SHARED) { /* * Reset the index if the page was already mapped * regularly before. */ if (page->mapping) - page->index = 1; - page->mapping = (void *)PAGE_MAPPING_DAX_COW; + page->share = 1; + page->mapping = PAGE_MAPPING_DAX_SHARED; } - page->index++; + page->share++; +} + +static inline unsigned long dax_page_share_put(struct page *page) +{ + return --page->share; } /* - * When it is called in dax_insert_entry(), the cow flag will indicate that + * When it is called in dax_insert_entry(), the shared flag will indicate that * whether this entry is shared by multiple files. If so, set the page->mapping - * FS_DAX_MAPPING_COW, and use page->index as refcount. + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount. */ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address, bool cow) + struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - if (cow) { - dax_mapping_set_cow(page); + if (shared) { + dax_page_share_get(page); } else { WARN_ON_ONCE(page->mapping); page->mapping = mapping; @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - if (dax_mapping_is_cow(page->mapping)) { - /* keep the CoW flag if this page is still shared */ - if (page->index-- > 0) + if (dax_page_is_shared(page)) { + /* keep the shared flag if this page is still shared */ + if (dax_page_share_put(page) > 0) continue; } else WARN_ON_ONCE(page->mapping && page->mapping != mapping); @@ -840,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, (iter->iomap.flags & IOMAP_F_DIRTY); } -static bool dax_fault_is_cow(const struct iomap_iter *iter) -{ - return (iter->flags & IOMAP_WRITE) && - (iter->iomap.flags & IOMAP_F_SHARED); -} - /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to @@ -859,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); - bool dirty = !dax_fault_is_synchronous(iter, vmf->vma); - bool cow = dax_fault_is_cow(iter); + bool write = iter->flags & IOMAP_WRITE; + bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); + bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { + if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) @@ -877,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, xas_reset(xas); xas_lock_irq(xas); - if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, - cow); + shared); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -902,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - if (cow) + if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); @@ -1086,7 +1087,8 @@ out: } /** - * dax_iomap_cow_copy - Copy the data from source to destination before write + * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page + * by copying the data before and after the range to be written. * @pos: address to do copy from. * @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) @@ -1095,35 +1097,50 @@ out: * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX - * write operation, dax_iomap_actor() might call this to do the copy of either + * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. In the latter case the rest of the copy of - * aligned ranges is taken care by dax_iomap_actor() itself. + * aligned ranges is taken care by dax_iomap_iter() itself. + * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the + * area to make sure no old data remains. */ -static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, +static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); + /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; + /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ + bool zero_edge = srcmap->flags & IOMAP_F_SHARED || + srcmap->type == IOMAP_UNWRITTEN; void *saddr = 0; int ret = 0; - ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); - if (ret) - return ret; + if (!zero_edge) { + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); + if (ret) + return ret; + } if (copy_all) { - ret = copy_mc_to_kernel(daddr, saddr, length); - return ret ? -EIO : 0; + if (zero_edge) + memset(daddr, 0, size); + else + ret = copy_mc_to_kernel(daddr, saddr, length); + goto out; } /* Copy the head part of the range */ if (head_off) { - ret = copy_mc_to_kernel(daddr, saddr, head_off); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr, 0, head_off); + else { + ret = copy_mc_to_kernel(daddr, saddr, head_off); + if (ret) + return -EIO; + } } /* Copy the tail part of the range */ @@ -1131,12 +1148,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; - ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, - tail_len); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr + tail_off, 0, tail_len); + else { + ret = copy_mc_to_kernel(daddr + tail_off, + saddr + tail_off, tail_len); + if (ret) + return -EIO; + } } - return 0; +out: + if (zero_edge) + dax_flush(srcmap->dax_dev, daddr, size); + return ret ? -EIO : 0; } /* @@ -1221,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ +static s64 dax_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + int id = 0; + s64 ret = 0; + void *daddr = NULL, *saddr = NULL; + + /* don't bother with blocks that are not shared to start with */ + if (!(iomap->flags & IOMAP_F_SHARED)) + return length; + /* don't bother with holes or unwritten extents */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + id = dax_read_lock(); + ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = copy_mc_to_kernel(daddr, saddr, length); + if (ret) + ret = -EIO; + +out_unlock: + dax_read_unlock(id); + return ret; +} + +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_unshare_iter(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(dax_file_unshare); + static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; @@ -1235,13 +1311,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) if (ret < 0) return ret; memset(kaddr + offset, 0, size); - if (srcmap->addr != iomap->addr) { - ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, - kaddr); - if (ret < 0) - return ret; - dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE); - } else + if (iomap->flags & IOMAP_F_SHARED) + ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, + kaddr); + else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } @@ -1258,6 +1331,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return length; + /* + * invalidate the pages whose sharing state is to be changed + * because of CoW. + */ + if (iomap->flags & IOMAP_F_SHARED) + invalidate_inode_pages2_range(iter->inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + length - 1) >> PAGE_SHIFT); + do { unsigned offset = offset_in_page(pos); unsigned size = min_t(u64, PAGE_SIZE - offset, length); @@ -1318,12 +1400,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; - const struct iomap *srcmap = &iomi->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; + bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; @@ -1350,7 +1433,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ - if (iomap->flags & IOMAP_F_NEW) { + if (iomap->flags & IOMAP_F_NEW || cow) { invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1384,10 +1467,9 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { - ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, - kaddr); + if (cow) { + ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, + srcmap, kaddr); if (ret) break; } @@ -1532,7 +1614,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; - const struct iomap *srcmap = &iter->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; @@ -1563,9 +1645,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { - err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr); + if (write && iomap->flags & IOMAP_F_SHARED) { + err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } @@ -1936,15 +2017,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret; + int ret, compared = 0; - while ((ret = iomap_iter(&src_iter, ops)) > 0) { - while ((ret = iomap_iter(&dst_iter, ops)) > 0) { - dst_iter.processed = dax_range_compare_iter(&src_iter, - &dst_iter, len, same); - } - if (ret <= 0) - src_iter.processed = ret; + while ((ret = iomap_iter(&src_iter, ops)) > 0 && + (ret = iomap_iter(&dst_iter, ops)) > 0) { + compared = dax_range_compare_iter(&src_iter, &dst_iter, len, + same); + if (compared < 0) + return ret; + src_iter.processed = dst_iter.processed = compared; } return ret; } |