From 3fe0791c295cfd3cd735de7a32cc0780949c009f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 14 Oct 2017 17:13:45 -0700 Subject: dax: store pfns in the radix In preparation for examining the busy state of dax pages in the truncate path, switch from sectors to pfns in the radix. Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Ross Zwisler Reviewed-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 83 ++++++++++++++++++++++++---------------------------------------- 1 file changed, 31 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 0276df90e86c..b646a46e4d12 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -73,16 +73,15 @@ fs_initcall(init_dax_wait_table); #define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) -static unsigned long dax_radix_sector(void *entry) +static unsigned long dax_radix_pfn(void *entry) { return (unsigned long)entry >> RADIX_DAX_SHIFT; } -static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) { return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | - ((unsigned long)sector << RADIX_DAX_SHIFT) | - RADIX_DAX_ENTRY_LOCK); + (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); } static unsigned int dax_radix_order(void *entry) @@ -526,12 +525,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, */ static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, - void *entry, sector_t sector, + void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; - void *new_entry; + unsigned long pfn = pfn_t_to_pfn(pfn_t); pgoff_t index = vmf->pgoff; + void *new_entry; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -546,7 +546,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, } 
spin_lock_irq(&mapping->tree_lock); - new_entry = dax_radix_locked_entry(sector, flags); + new_entry = dax_radix_locked_entry(pfn, flags); if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* @@ -657,17 +657,14 @@ unlock_pte: i_mmap_unlock_read(mapping); } -static int dax_writeback_one(struct block_device *bdev, - struct dax_device *dax_dev, struct address_space *mapping, - pgoff_t index, void *entry) +static int dax_writeback_one(struct dax_device *dax_dev, + struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - void *entry2, **slot, *kaddr; - long ret = 0, id; - sector_t sector; - pgoff_t pgoff; + void *entry2, **slot; + unsigned long pfn; + long ret = 0; size_t size; - pfn_t pfn; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -683,10 +680,10 @@ static int dax_writeback_one(struct block_device *bdev, goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. We have to - * compare sectors as we must not bail out due to difference in lockbit + * compare pfns as we must not bail out due to difference in lockbit * or entry type. */ - if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { @@ -712,33 +709,15 @@ static int dax_writeback_one(struct block_device *bdev, /* * Even if dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we are given will be aligned to - * the start index of the PMD, as will the sector we pull from - * 'entry'. This allows us to flush for PMD_SIZE and not have to - * worry about partial PMD writebacks. + * the start index of the PMD, as will the pfn we pull from 'entry'. + * This allows us to flush for PMD_SIZE and not have to worry about + * partial PMD writebacks. 
*/ - sector = dax_radix_sector(entry); + pfn = dax_radix_pfn(entry); size = PAGE_SIZE << dax_radix_order(entry); - id = dax_read_lock(); - ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); - if (ret) - goto dax_unlock; - - /* - * dax_direct_access() may sleep, so cannot hold tree_lock over - * its invocation. - */ - ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); - if (ret < 0) - goto dax_unlock; - - if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { - ret = -EIO; - goto dax_unlock; - } - - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - dax_flush(dax_dev, kaddr, size); + dax_mapping_entry_mkclean(mapping, index, pfn); + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -749,8 +728,6 @@ static int dax_writeback_one(struct block_device *bdev, radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); - dax_unlock: - dax_read_unlock(id); put_locked_mapping_entry(mapping, index); return ret; @@ -808,8 +785,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, break; } - ret = dax_writeback_one(bdev, dax_dev, mapping, - indices[i], pvec.pages[i]); + ret = dax_writeback_one(dax_dev, mapping, indices[i], + pvec.pages[i]); if (ret < 0) { mapping_set_error(mapping, ret); goto out; @@ -877,6 +854,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, int ret = VM_FAULT_NOPAGE; struct page *zero_page; void *entry2; + pfn_t pfn; zero_page = ZERO_PAGE(0); if (unlikely(!zero_page)) { @@ -884,14 +862,15 @@ static int dax_load_hole(struct address_space *mapping, void *entry, goto out; } - entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, + pfn = page_to_pfn_t(zero_page); + entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 
RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(entry2)) { ret = VM_FAULT_SIGBUS; goto out; } - vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); + vm_insert_mixed(vmf->vma, vaddr, pfn); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1200,8 +1179,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto error_finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, - dax_iomap_sector(&iomap, pos), + entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 0, write && !sync); if (IS_ERR(entry)) { error = PTR_ERR(entry); @@ -1280,13 +1258,15 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; + pfn_t pfn; zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); if (unlikely(!zero_page)) goto fallback; - ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, + pfn = page_to_pfn_t(zero_page); + ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(ret)) goto fallback; @@ -1409,8 +1389,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, - dax_iomap_sector(&iomap, pos), + entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD, write && !sync); if (IS_ERR(entry)) goto finish_iomap; -- cgit v1.2.3 From f44c77630d26ca2c2a60b20c47dd9ce07c4361b3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Mar 2018 15:26:44 -0800 Subject: fs, dax: prepare for dax-specific address_space_operations In preparation for the dax implementation to start associating dax pages to inodes via page->mapping, we need to provide a 'struct address_space_operations' instance for dax. Define some generic VFS aops helpers for dax. 
These noop implementations are there in the dax case to prevent the VFS from falling back to operations with page-cache assumptions. Because dax_writeback_mapping_range() may not be referenced in the FS_DAX=n case, also provide an inline stub for it that returns -EOPNOTSUPP. Cc: Jeff Moyer Cc: Ross Zwisler Suggested-by: Matthew Wilcox Suggested-by: Jan Kara Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Suggested-by: Dave Chinner Signed-off-by: Dan Williams --- fs/libfs.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 12 +++++++++--- include/linux/fs.h | 4 ++++ 3 files changed, 52 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 7ff3cb904acd..0fb590d79f30 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1060,6 +1060,45 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); +int noop_set_page_dirty(struct page *page) +{ + /* + * Unlike __set_page_dirty_no_writeback that handles dirty page + * tracking in the page object, dax does all dirty tracking in + * the inode address_space in response to mkwrite faults. In the + * dax case we only need to worry about potentially dirty CPU + * caches, not dirty page cache pages to write back. + * + * This callback is defined to prevent fallback to + * __set_page_dirty_buffers() in set_page_dirty(). + */ + return 0; +} +EXPORT_SYMBOL_GPL(noop_set_page_dirty); + +void noop_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + /* + * There is no page cache to invalidate in the dax case, however + * we need this callback defined to prevent falling back to + * block_invalidatepage() in do_invalidatepage(). + */ +} +EXPORT_SYMBOL_GPL(noop_invalidatepage); + +ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + /* + * iomap based filesystems support direct I/O without need for + * this callback. 
However, it still needs to be set in + * inode->a_ops so that open/fcntl know that direct I/O is + * generally supported. + */ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(noop_direct_IO); + /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { diff --git a/include/linux/dax.h b/include/linux/dax.h index 0185ecdae135..ae27a7efe7ab 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,6 +38,7 @@ static inline void put_dax(struct dax_device *dax_dev) } #endif +struct writeback_control; int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) int __bdev_dax_supported(struct super_block *sb, int blocksize); @@ -57,6 +58,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev) } struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); +int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc); #else static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { @@ -76,6 +79,12 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) { return NULL; } + +static inline int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc) +{ + return -EOPNOTSUPP; +} #endif int dax_read_lock(void); @@ -121,7 +130,4 @@ static inline bool dax_mapping(struct address_space *mapping) return mapping->host && IS_DAX(mapping->host); } -struct writeback_control; -int dax_writeback_mapping_range(struct address_space *mapping, - struct block_device *bdev, struct writeback_control *wbc); #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 79c413985305..44f7f7080faa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3129,6 +3129,10 @@ extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); 
extern int noop_fsync(struct file *, loff_t, loff_t, int); +extern int noop_set_page_dirty(struct page *page); +extern void noop_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); +extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, -- cgit v1.2.3 From 15aa8a01189b7399d085cceae6775cc6899b1909 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 11 Mar 2018 12:03:08 -0700 Subject: block, dax: remove dead code in blkdev_writepages() Block device inodes never have S_DAX set, so kill the check for DAX and diversion to dax_writeback_mapping_range(). Cc: Jeff Moyer Cc: Ross Zwisler Cc: Matthew Wilcox Cc: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Dan Williams --- fs/block_dev.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index fe09ef9c21f3..846ee2d31781 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1946,11 +1946,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { - if (dax_mapping(mapping)) { - struct block_device *bdev = I_BDEV(mapping->host); - - return dax_writeback_mapping_range(mapping, bdev, wbc); - } return generic_writepages(mapping, wbc); } -- cgit v1.2.3 From 6e2608dfd93464bb26ba868b301ad5336c8c1df8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Mar 2018 15:26:44 -0800 Subject: xfs, dax: introduce xfs_dax_aops In preparation for the dax implementation to start associating dax pages to inodes via page->mapping, we need to provide a 'struct address_space_operations' instance for dax. 
Otherwise, direct-I/O triggers incorrect page cache assumptions and warnings like the following: WARNING: CPU: 27 PID: 1783 at fs/xfs/xfs_aops.c:1468 xfs_vm_set_page_dirty+0xf3/0x1b0 [xfs] [..] CPU: 27 PID: 1783 Comm: dma-collision Tainted: G O 4.15.0-rc2+ #984 [..] Call Trace: set_page_dirty_lock+0x40/0x60 bio_set_pages_dirty+0x37/0x50 iomap_dio_actor+0x2b7/0x3b0 ? iomap_dio_zero+0x110/0x110 iomap_apply+0xa4/0x110 iomap_dio_rw+0x29e/0x3b0 ? iomap_dio_zero+0x110/0x110 ? xfs_file_dio_aio_read+0x7c/0x1a0 [xfs] xfs_file_dio_aio_read+0x7c/0x1a0 [xfs] xfs_file_read_iter+0xa0/0xc0 [xfs] __vfs_read+0xf9/0x170 vfs_read+0xa6/0x150 SyS_pread64+0x93/0xb0 entry_SYSCALL_64_fastpath+0x1f/0x96 ...where the default set_page_dirty() handler assumes that dirty state is being tracked in 'struct page' flags. Cc: Jeff Moyer Cc: Matthew Wilcox Cc: Ross Zwisler Suggested-by: Jan Kara Suggested-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Dan Williams --- fs/xfs/xfs_aops.c | 34 ++++++++++++++++++---------------- fs/xfs/xfs_aops.h | 1 + fs/xfs/xfs_iops.c | 5 ++++- 3 files changed, 23 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9c6a830da0ee..e7a56c4786ff 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1194,16 +1194,22 @@ xfs_vm_writepages( int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - if (dax_mapping(mapping)) - return dax_writeback_mapping_range(mapping, - xfs_find_bdev_for_inode(mapping->host), wbc); - ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); if (wpc.ioend) ret = xfs_submit_ioend(wbc, wpc.ioend, ret); return ret; } +STATIC int +xfs_dax_writepages( + struct address_space *mapping, + struct writeback_control *wbc) +{ + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return dax_writeback_mapping_range(mapping, + xfs_find_bdev_for_inode(mapping->host), wbc); +} + /* * Called to move a page into cleanable state - and from there * 
to be released. The page should already be clean. We always @@ -1367,17 +1373,6 @@ out_unlock: return error; } -STATIC ssize_t -xfs_vm_direct_IO( - struct kiocb *iocb, - struct iov_iter *iter) -{ - /* - * We just need the method present so that open/fcntl allow direct I/O. - */ - return -EINVAL; -} - STATIC sector_t xfs_vm_bmap( struct address_space *mapping, @@ -1500,8 +1495,15 @@ const struct address_space_operations xfs_address_space_operations = { .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, .bmap = xfs_vm_bmap, - .direct_IO = xfs_vm_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; + +const struct address_space_operations xfs_dax_aops = { + .writepages = xfs_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 88c85ea63da0..69346d460dfa 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -54,6 +54,7 @@ struct xfs_ioend { }; extern const struct address_space_operations xfs_address_space_operations; +extern const struct address_space_operations xfs_dax_aops; int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 56475fcd76f2..951e84df5576 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1272,7 +1272,10 @@ xfs_setup_iops( case S_IFREG: inode->i_op = &xfs_inode_operations; inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; + if (IS_DAX(inode)) + inode->i_mapping->a_ops = &xfs_dax_aops; + else + inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) -- cgit v1.2.3 From 5f0663bb4a64f588f0a2dd6d1be68d40f9af0086 Mon Sep 17 00:00:00 2001 From: Dan Williams 
Date: Thu, 21 Dec 2017 12:25:11 -0800 Subject: ext4, dax: introduce ext4_dax_aops In preparation for the dax implementation to start associating dax pages to inodes via page->mapping, we need to provide a 'struct address_space_operations' instance for dax. Otherwise, direct-I/O triggers incorrect page cache assumptions and warnings. Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: linux-ext4@vger.kernel.org Reviewed-by: Jan Kara Signed-off-by: Dan Williams --- fs/ext4/inode.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c94780075b04..249a97b19181 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2725,12 +2725,6 @@ static int ext4_writepages(struct address_space *mapping, percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - if (dax_mapping(mapping)) { - ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, - wbc); - goto out_writepages; - } - /* * No pages to write? 
This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -2955,6 +2949,27 @@ out_writepages: return ret; } +static int ext4_dax_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + int ret; + long nr_to_write = wbc->nr_to_write; + struct inode *inode = mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + + percpu_down_read(&sbi->s_journal_flag_rwsem); + trace_ext4_writepages(inode, wbc); + + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); + return ret; +} + static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; @@ -3857,10 +3872,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ext4_has_inline_data(inode)) return 0; - /* DAX uses iomap path now */ - if (WARN_ON_ONCE(IS_DAX(inode))) - return 0; - trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == READ) ret = ext4_direct_IO_read(iocb, iter); @@ -3946,6 +3957,13 @@ static const struct address_space_operations ext4_da_aops = { .error_remove_page = generic_error_remove_page, }; +static const struct address_space_operations ext4_dax_aops = { + .writepages = ext4_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { @@ -3958,7 +3976,9 @@ void ext4_set_aops(struct inode *inode) default: BUG(); } - if (test_opt(inode->i_sb, DELALLOC)) + if (IS_DAX(inode)) + inode->i_mapping->a_ops = &ext4_dax_aops; + else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; 
-- cgit v1.2.3 From fb094c90748fbeba1063927eeb751add147b35b9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 Dec 2017 12:25:11 -0800 Subject: ext2, dax: introduce ext2_dax_aops In preparation for the dax implementation to start associating dax pages to inodes via page->mapping, we need to provide a 'struct address_space_operations' instance for dax. Otherwise, direct-I/O triggers incorrect page cache assumptions and warnings. Reviewed-by: Jan Kara Reported-by: kbuild test robot Signed-off-by: Dan Williams --- fs/ext2/ext2.h | 1 + fs/ext2/inode.c | 46 +++++++++++++++++++++++++++------------------- fs/ext2/namei.c | 18 ++---------------- 3 files changed, 30 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 032295e1d386..cc40802ddfa8 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -814,6 +814,7 @@ extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; /* inode.c */ +extern void ext2_set_file_ops(struct inode *inode); extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; extern const struct iomap_ops ext2_iomap_ops; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 9b2ac55ac34f..1e01fabef130 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -940,9 +940,6 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; - if (WARN_ON_ONCE(IS_DAX(inode))) - return -EIO; - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); @@ -952,17 +949,16 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { -#ifdef CONFIG_FS_DAX - if (dax_mapping(mapping)) { - return dax_writeback_mapping_range(mapping, - mapping->host->i_sb->s_bdev, - wbc); - } -#endif - 
return mpage_writepages(mapping, wbc, ext2_get_block); } +static int +ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return dax_writeback_mapping_range(mapping, + mapping->host->i_sb->s_bdev, wbc); +} + const struct address_space_operations ext2_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -990,6 +986,13 @@ const struct address_space_operations ext2_nobh_aops = { .error_remove_page = generic_error_remove_page, }; +static const struct address_space_operations ext2_dax_aops = { + .writepages = ext2_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. @@ -1388,6 +1391,18 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_DAX; } +void ext2_set_file_ops(struct inode *inode) +{ + inode->i_op = &ext2_file_inode_operations; + inode->i_fop = &ext2_file_operations; + if (IS_DAX(inode)) + inode->i_mapping->a_ops = &ext2_dax_aops; + else if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; +} + struct inode *ext2_iget (struct super_block *sb, unsigned long ino) { struct ext2_inode_info *ei; @@ -1480,14 +1495,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) ei->i_data[n] = raw_inode->i_block[n]; if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } + ext2_set_file_ops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c 
index e078075dc66f..55f7caadb093 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -107,14 +107,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } + ext2_set_file_ops(inode); mark_inode_dirty(inode); return ext2_add_nondir(dentry, inode); } @@ -125,14 +118,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } + ext2_set_file_ops(inode); mark_inode_dirty(inode); d_tmpfile(dentry, inode); unlock_new_inode(inode); -- cgit v1.2.3 From d2c997c0f14535eff68d8ed9c2f1c5e100625751 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Dec 2017 22:02:48 -0800 Subject: fs, dax: use page->mapping to warn if truncate collides with a busy page Catch cases where extent unmap operations encounter pages that are pinned / busy. Typically these are pinned pages that are under active DMA. This warning is a canary for potential data corruption as truncated blocks could be allocated to a new file while the device is still performing i/o. Here is an example of a collision that this implementation catches: WARNING: CPU: 2 PID: 1286 at fs/dax.c:343 dax_disassociate_entry+0x55/0x80 [..] Call Trace: __dax_invalidate_mapping_entry+0x6c/0xf0 dax_delete_mapping_entry+0xf/0x20 truncate_exceptional_pvec_entries.part.12+0x1af/0x200 truncate_inode_pages_range+0x268/0x970 ? tlb_gather_mmu+0x10/0x20 ? up_write+0x1c/0x40 ? 
unmap_mapping_range+0x73/0x140 xfs_free_file_space+0x1b6/0x5b0 [xfs] ? xfs_file_fallocate+0x7f/0x320 [xfs] ? down_write_nested+0x40/0x70 ? xfs_ilock+0x21d/0x2f0 [xfs] xfs_file_fallocate+0x162/0x320 [xfs] ? rcu_read_lock_sched_held+0x3f/0x70 ? rcu_sync_lockdep_assert+0x2a/0x50 ? __sb_start_write+0xd0/0x1b0 ? vfs_fallocate+0x20c/0x270 vfs_fallocate+0x154/0x270 SyS_fallocate+0x43/0x80 entry_SYSCALL_64_fastpath+0x1f/0x96 Cc: Jeff Moyer Cc: Matthew Wilcox Cc: Ross Zwisler Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- fs/dax.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index b646a46e4d12..a77394fe586e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -298,6 +298,63 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, dax_wake_mapping_entry_waiter(mapping, index, entry, false); } +static unsigned long dax_entry_size(void *entry) +{ + if (dax_is_zero_entry(entry)) + return 0; + else if (dax_is_empty_entry(entry)) + return 0; + else if (dax_is_pmd_entry(entry)) + return PMD_SIZE; + else + return PAGE_SIZE; +} + +static unsigned long dax_radix_end_pfn(void *entry) +{ + return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; +} + +/* + * Iterate through all mapped pfns represented by an entry, i.e. skip + * 'empty' and 'zero' entries. 
+ */ +#define for_each_mapped_pfn(entry, pfn) \ + for (pfn = dax_radix_pfn(entry); \ + pfn < dax_radix_end_pfn(entry); pfn++) + +static void dax_associate_entry(void *entry, struct address_space *mapping) +{ + unsigned long pfn; + + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) + return; + + for_each_mapped_pfn(entry, pfn) { + struct page *page = pfn_to_page(pfn); + + WARN_ON_ONCE(page->mapping); + page->mapping = mapping; + } +} + +static void dax_disassociate_entry(void *entry, struct address_space *mapping, + bool trunc) +{ + unsigned long pfn; + + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) + return; + + for_each_mapped_pfn(entry, pfn) { + struct page *page = pfn_to_page(pfn); + + WARN_ON_ONCE(trunc && page_ref_count(page) > 1); + WARN_ON_ONCE(page->mapping && page->mapping != mapping); + page->mapping = NULL; + } +} + /* * Find radix tree entry at given index. If it points to an exceptional entry, * return it with the radix tree entry locked. If the radix tree doesn't @@ -404,6 +461,7 @@ restart: } if (pmd_downgrade) { + dax_disassociate_entry(entry, mapping, false); radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; dax_wake_mapping_entry_waiter(mapping, index, entry, @@ -453,6 +511,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) goto out; + dax_disassociate_entry(entry, mapping, trunc); radix_tree_delete(page_tree, index); mapping->nrexceptional--; ret = 1; @@ -547,6 +606,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, spin_lock_irq(&mapping->tree_lock); new_entry = dax_radix_locked_entry(pfn, flags); + if (dax_entry_size(entry) != dax_entry_size(new_entry)) { + dax_disassociate_entry(entry, mapping, false); + dax_associate_entry(new_entry, mapping); + } if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* -- cgit v1.2.3