From 2765cfbb342c727c3fd47b165196cb16da158022 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 18 Aug 2015 13:55:40 -0600 Subject: dax: update I/O path to do proper PMEM flushing Update the DAX I/O path so that all operations that store data (I/O writes, zeroing blocks, punching holes, etc.) properly synchronize the stores to media using the PMEM API. This ensures that the data DAX is writing is durable on media before the operation completes. Signed-off-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- fs/dax.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index c3e21ccfc358..e07fecc93f80 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -46,10 +47,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) unsigned pgsz = PAGE_SIZE - offset_in_page(addr); if (pgsz > count) pgsz = count; - if (pgsz < PAGE_SIZE) - memset(addr, 0, pgsz); - else - clear_page(addr); + clear_pmem((void __pmem *)addr, pgsz); addr += pgsz; size -= pgsz; count -= pgsz; @@ -59,6 +57,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) } } while (size); + wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_clear_blocks); @@ -70,15 +69,16 @@ static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); } +/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, loff_t end) { loff_t final = end - pos + first; /* The final byte of the buffer */ if (first > 0) - memset(addr, 0, first); + clear_pmem((void __pmem *)addr, first); if (final < size) - memset(addr + final, 0, size - final); + clear_pmem((void __pmem *)addr + final, size - final); } static bool buffer_written(struct buffer_head *bh) @@ -108,12 +108,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t bh_max = start; void *addr; bool hole = false; + bool need_wmb = false; if (iov_iter_rw(iter) != WRITE) end = min(end, i_size_read(inode)); while (pos < end) { - unsigned len; + size_t len; if (pos == max) { unsigned blkbits = inode->i_blkbits; sector_t block = pos >> blkbits; @@ -145,18 +146,22 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, retval = dax_get_addr(bh, &addr, blkbits); if (retval < 0) break; - if (buffer_unwritten(bh) || buffer_new(bh)) + if (buffer_unwritten(bh) || buffer_new(bh)) { dax_new_buf(addr, retval, first, pos, end); + need_wmb = true; + } addr += first; size = retval - first; } max = min(pos + size, end); } - if (iov_iter_rw(iter) == WRITE) - len = copy_from_iter_nocache(addr, max - pos, iter); - else if (!hole) + if (iov_iter_rw(iter) == WRITE) { + len = copy_from_iter_pmem((void __pmem *)addr, + max - pos, iter); + need_wmb = true; + } else if (!hole) len = copy_to_iter(addr, max - pos, iter); else len = iov_iter_zero(max - pos, iter); @@ -168,6 +173,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, addr += len; } + if (need_wmb) + wmb_pmem(); + return (pos == start) ? retval : pos - start; } @@ -303,8 +311,10 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, goto out; } - if (buffer_unwritten(bh) || buffer_new(bh)) - clear_page(addr); + if (buffer_unwritten(bh) || buffer_new(bh)) { + clear_pmem((void __pmem *)addr, PAGE_SIZE); + wmb_pmem(); + } error = vm_insert_mixed(vma, vaddr, pfn); @@ -542,7 +552,8 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, err = dax_get_addr(&bh, &addr, inode->i_blkbits); if (err < 0) return err; - memset(addr + offset, 0, length); + clear_pmem((void __pmem *)addr + offset, length); + wmb_pmem(); } return 0; -- cgit v1.2.3 From e2e05394e4a3420dab96f728df4531893494e15d Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 18 Aug 2015 13:55:41 -0600 Subject: pmem, dax: have direct_access use __pmem annotation Update the annotation for the kaddr pointer returned by direct_access() so that it is a __pmem pointer. This is consistent with the PMEM driver and with how this direct_access() pointer is used in the DAX code. Signed-off-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- Documentation/filesystems/Locking | 3 ++- arch/powerpc/sysdev/axonram.c | 7 ++++--- drivers/block/brd.c | 4 ++-- drivers/nvdimm/pmem.c | 4 ++-- drivers/s390/block/dcssblk.c | 10 ++++++---- fs/block_dev.c | 2 +- fs/dax.c | 37 ++++++++++++++++++++----------------- include/linux/blkdev.h | 8 ++++---- 8 files changed, 41 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 6a34a0f4d37c..06d443450f21 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -397,7 +397,8 @@ prototypes: int (*release) (struct gendisk *, fmode_t); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); + int (*direct_access) (struct block_device *, sector_t, void __pmem **, + unsigned long *); int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index ee90db17b097..a2be2a66dab6 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -141,13 +141,14 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; + void *addr = (void *)(bank->ph_addr + offset); - *kaddr = (void *)(bank->ph_addr + offset); - *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; + *kaddr = (void __pmem *)addr; + *pfn = virt_to_phys(addr) >> PAGE_SHIFT; return bank->size - offset; } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 64ab4951e9d6..c96402fd1560 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -371,7 +371,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; @@ -381,7 +381,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, page = brd_insert_page(brd, sector); if (!page) return -ENOSPC; - *kaddr = page_address(page); + *kaddr = (void __pmem *)page_address(page); *pfn = page_to_pfn(page); /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index eb7552d939e1..f3b629779266 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -92,7 +92,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct pmem_device *pmem = bdev->bd_disk->private_data; size_t offset = sector << 9; @@ -101,7 +101,7 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector, return -ENODEV; /* FIXME convert DAX to comprehend that this mapping has a lifetime */ - *kaddr = (void __force *) pmem->virt_addr + offset; + *kaddr = pmem->virt_addr + offset; *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; return pmem->size - offset; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index da212813f2d5..2c5a397b9f3e 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -29,7 +29,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_make_request(struct request_queue *q, struct bio *bio); static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn, long size); + void __pmem **kaddr, unsigned long *pfn, long size); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -879,18 +879,20 @@ fail: static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct dcssblk_dev_info *dev_info; unsigned long offset, dev_sz; + void *addr; dev_info = bdev->bd_disk->private_data; if (!dev_info) return -ENODEV; dev_sz = dev_info->end - dev_info->start; offset = secnum * 512; - *kaddr = (void *) (dev_info->start + offset); - *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; + addr = (void *) (dev_info->start + offset); + *pfn = virt_to_phys(addr) >> PAGE_SHIFT; + *kaddr = (void __pmem *) addr; return dev_sz - offset; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 198243717da5..2345a9870e2c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page); * accessible at this address. */ long bdev_direct_access(struct block_device *bdev, sector_t sector, - void **addr, unsigned long *pfn, long size) + void __pmem **addr, unsigned long *pfn, long size) { long avail; const struct block_device_operations *ops = bdev->bd_disk->fops; diff --git a/fs/dax.c b/fs/dax.c index e07fecc93f80..7c634ac797b1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -35,7 +35,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) might_sleep(); do { - void *addr; + void __pmem *addr; unsigned long pfn; long count; @@ -47,7 +47,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) unsigned pgsz = PAGE_SIZE - offset_in_page(addr); if (pgsz > count) pgsz = count; - clear_pmem((void __pmem *)addr, pgsz); + clear_pmem(addr, pgsz); addr += pgsz; size -= pgsz; count -= pgsz; @@ -62,7 +62,8 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) } EXPORT_SYMBOL_GPL(dax_clear_blocks); -static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, + unsigned blkbits) { unsigned long pfn; sector_t sector = bh->b_blocknr << (blkbits - 9); @@ -70,15 +71,15 @@ static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) } /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ -static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, - loff_t end) +static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, + loff_t pos, loff_t end) { loff_t final = end - pos + first; /* The final byte of the buffer */ if (first > 0) - clear_pmem((void __pmem *)addr, first); + clear_pmem(addr, first); if (final < size) - clear_pmem((void __pmem *)addr + final, size - final); + clear_pmem(addr + final, size - final); } static bool buffer_written(struct buffer_head *bh) @@ -106,7 +107,7 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t pos = start; loff_t max = start; loff_t bh_max = start; - void *addr; + void __pmem *addr; bool hole = false; bool need_wmb = false; @@ -158,11 +159,11 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, } if (iov_iter_rw(iter) == WRITE) { - len = copy_from_iter_pmem((void __pmem *)addr, - max - pos, iter); + len = copy_from_iter_pmem(addr, max - pos, iter); need_wmb = true; } else if (!hole) - len = copy_to_iter(addr, max - pos, iter); + len = copy_to_iter((void __force *)addr, max - pos, + iter); else len = iov_iter_zero(max - pos, iter); @@ -268,11 +269,13 @@ static int dax_load_hole(struct address_space *mapping, struct page *page, static int copy_user_bh(struct page *to, struct buffer_head *bh, unsigned blkbits, unsigned long vaddr) { - void *vfrom, *vto; + void __pmem *vfrom; + void *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) return -EIO; vto = kmap_atomic(to); - copy_user_page(vto, vfrom, vaddr, to); + copy_user_page(vto, (void __force *)vfrom, vaddr, to); kunmap_atomic(vto); return 0; } @@ -283,7 +286,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct address_space *mapping = inode->i_mapping; sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; - void *addr; + void __pmem *addr; unsigned long pfn; pgoff_t size; int error; @@ -312,7 +315,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem((void __pmem *)addr, PAGE_SIZE); + clear_pmem(addr, PAGE_SIZE); wmb_pmem(); } @@ -548,11 +551,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, if (err < 0) return err; if (buffer_written(&bh)) { - void *addr; + void __pmem *addr; err = dax_get_addr(&bh, &addr, inode->i_blkbits); if (err < 0) return err; - clear_pmem((void __pmem *)addr + offset, length); + clear_pmem(addr + offset, length); wmb_pmem(); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4068c17d0df..c401ecdff9cb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1555,8 +1555,8 @@ struct block_device_operations { int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - long (*direct_access)(struct block_device *, sector_t, - void **, unsigned long *pfn, long size); + long (*direct_access)(struct block_device *, sector_t, void __pmem **, + unsigned long *pfn, long size); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ @@ -1574,8 +1574,8 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -extern long bdev_direct_access(struct block_device *, sector_t, void **addr, - unsigned long *pfn, long size); +extern long bdev_direct_access(struct block_device *, sector_t, + void __pmem **addr, unsigned long *pfn, long size); #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3 From cb389b9c0e00c30c9daf20287f7d91e2466edbb1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 Aug 2015 17:41:00 -0400 Subject: dax: drop size parameter to ->direct_access() None of the implementations currently use it. The common bdev_direct_access() entry point handles all the size checks before calling ->direct_access(). Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/powerpc/sysdev/axonram.c | 2 +- drivers/block/brd.c | 6 +----- drivers/nvdimm/pmem.c | 2 +- drivers/s390/block/dcssblk.c | 4 ++-- fs/block_dev.c | 2 +- include/linux/blkdev.h | 2 +- 6 files changed, 7 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index a2be2a66dab6..4419c84ac15a 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -141,7 +141,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void __pmem **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c96402fd1560..03c45c41bdfa 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -371,7 +371,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; @@ -384,10 +384,6 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, *kaddr = (void __pmem *)page_address(page); *pfn = page_to_pfn(page); - /* - * TODO: If size > PAGE_SIZE, we could look to see if the next page in - * the file happens to be mapped to the next page of physical RAM. - */ return PAGE_SIZE; } #else diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index f3b629779266..3b5b9cb758b6 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -92,7 +92,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn) { struct pmem_device *pmem = bdev->bd_disk->private_data; size_t offset = sector << 9; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 2c5a397b9f3e..8c027a9e4e8a 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -29,7 +29,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_make_request(struct request_queue *q, struct bio *bio); static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void __pmem **kaddr, unsigned long *pfn, long size); + void __pmem **kaddr, unsigned long *pfn); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -879,7 +879,7 @@ fail: static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void __pmem **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn) { struct dcssblk_dev_info *dev_info; unsigned long offset, dev_sz; diff --git a/fs/block_dev.c b/fs/block_dev.c index 2345a9870e2c..3831e5691b32 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -462,7 +462,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, addr, pfn, size); + avail = ops->direct_access(bdev, sector, addr, pfn); if (!avail) return -ERANGE; return min(avail, size); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c401ecdff9cb..c22064f326b2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1556,7 +1556,7 @@ struct block_device_operations { int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); long (*direct_access)(struct block_device *, sector_t, void __pmem **, - unsigned long *pfn, long size); + unsigned long *pfn); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ -- cgit v1.2.3