diff options
Diffstat (limited to 'fs')
261 files changed, 7086 insertions, 4574 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 6181ad79e1a5..5ca1fb0043f6 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -34,6 +34,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/uio.h> +#include <linux/bvec.h> #include <net/9p/9p.h> #include <net/9p/client.h> diff --git a/fs/Kconfig b/fs/Kconfig index 884653fc6a8b..c2a377cdda2b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -55,7 +55,6 @@ config FS_DAX_PMD depends on FS_DAX depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE - depends on BROKEN endif # BLOCK diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 4c09d93d9569..b2f82cf6bf86 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -170,8 +170,8 @@ config BINFMT_MISC You can do other nice things, too. Read the file <file:Documentation/binfmt_misc.txt> to learn how to use this - feature, <file:Documentation/java.txt> for information about how - to include Java support. and <file:Documentation/mono.txt> for + feature, <file:Documentation/admin-guide/java.rst> for information about how + to include Java support. and <file:Documentation/admin-guide/mono.rst> for information about how to include Mono-based .NET support. To use binfmt_misc, you will need to mount it: diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2472af2798c7..e6c1bd443806 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); + if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) + goto end_coredump; + vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz)); if (!vma_filesz) goto end_coredump; @@ -2311,7 +2313,7 @@ end_coredump: cleanup: free_note_info(&info); kfree(shdr4extnum); - kfree(vma_filesz); + vfree(vma_filesz); kfree(phdr4note); kfree(elf); out: diff --git a/fs/block_dev.c b/fs/block_dev.c index 05b553368bb4..7c4507224ed6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -30,6 +30,7 @@ #include <linux/cleancache.h> #include <linux/dax.h> #include <linux/badblocks.h> +#include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <asm/uaccess.h> #include "internal.h" @@ -175,16 +176,269 @@ static struct inode *bdev_file_inode(struct file *file) return file->f_mapping->host; } +static unsigned int dio_bio_write_op(struct kiocb *iocb) +{ + unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + + /* avoid the need for a I/O completion work item */ + if (iocb->ki_flags & IOCB_DSYNC) + op |= REQ_FUA; + return op; +} + +#define DIO_INLINE_BIO_VECS 4 + +static void blkdev_bio_end_io_simple(struct bio *bio) +{ + struct task_struct *waiter = bio->bi_private; + + WRITE_ONCE(bio->bi_private, NULL); + wake_up_process(waiter); +} + static ssize_t -blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, + int nr_pages) +{ + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec; + loff_t pos = iocb->ki_pos; + bool should_dirty = false; + struct bio bio; + ssize_t ret; + blk_qc_t qc; + int i; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + if (nr_pages <= DIO_INLINE_BIO_VECS) + vecs = inline_vecs; + else { + vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL); + if (!vecs) + return -ENOMEM; + } + + bio_init(&bio, vecs, nr_pages); + bio.bi_bdev = bdev; + bio.bi_iter.bi_sector = pos >> 9; + bio.bi_private = current; + bio.bi_end_io = blkdev_bio_end_io_simple; + + ret = bio_iov_iter_get_pages(&bio, iter); + if (unlikely(ret)) + return ret; + ret = bio.bi_iter.bi_size; + + if (iov_iter_rw(iter) == READ) { + bio.bi_opf = REQ_OP_READ; + if (iter_is_iovec(iter)) + should_dirty = true; + } else { + bio.bi_opf = dio_bio_write_op(iocb); + task_io_account_write(ret); + } + + qc = submit_bio(&bio); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio.bi_private)) + break; + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_mq_poll(bdev_get_queue(bdev), qc)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + + bio_for_each_segment_all(bvec, &bio, i) { + if (should_dirty && !PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); + put_page(bvec->bv_page); + } + + if (vecs != inline_vecs) + kfree(vecs); + + if (unlikely(bio.bi_error)) + return bio.bi_error; + return ret; +} + +struct blkdev_dio { + union { + struct kiocb *iocb; + struct task_struct *waiter; + }; + size_t size; + atomic_t ref; + bool multi_bio : 1; + bool should_dirty : 1; + bool is_sync : 1; + struct bio bio; +}; + +static struct bio_set *blkdev_dio_pool __read_mostly; + +static void blkdev_bio_end_io(struct bio *bio) +{ + struct blkdev_dio *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { + if (bio->bi_error && !dio->bio.bi_error) + dio->bio.bi_error = bio->bi_error; + } else { + if (!dio->is_sync) { + struct kiocb *iocb = dio->iocb; + ssize_t ret = dio->bio.bi_error; + + if (likely(!ret)) { + ret = dio->size; + iocb->ki_pos += ret; + } + + dio->iocb->ki_complete(iocb, ret, 0); + bio_put(&dio->bio); + } else { + struct task_struct *waiter = dio->waiter; + + WRITE_ONCE(dio->waiter, NULL); + wake_up_process(waiter); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); + bio_put(bio); + } +} + +static ssize_t +__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(inode); + struct blkdev_dio *dio; + struct bio *bio; + bool is_read = (iov_iter_rw(iter) == READ); + loff_t pos = iocb->ki_pos; + blk_qc_t qc = BLK_QC_T_NONE; + int ret; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool); + bio_get(bio); /* extra ref for the completion handler */ + + dio = container_of(bio, struct blkdev_dio, bio); + dio->is_sync = is_sync_kiocb(iocb); + if (dio->is_sync) + dio->waiter = current; + else + dio->iocb = iocb; + + dio->size = 0; + dio->multi_bio = false; + dio->should_dirty = is_read && (iter->type == ITER_IOVEC); + + for (;;) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = pos >> 9; + bio->bi_private = dio; + bio->bi_end_io = blkdev_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_error = ret; + bio_endio(bio); + break; + } + + if (is_read) { + bio->bi_opf = REQ_OP_READ; + if (dio->should_dirty) + bio_set_pages_dirty(bio); + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); + if (!nr_pages) { + qc = submit_bio(bio); + break; + } + + if (!dio->multi_bio) { + dio->multi_bio = true; + atomic_set(&dio->ref, 2); + } else { + atomic_inc(&dio->ref); + } - return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, - blkdev_get_block, NULL, NULL, - DIO_SKIP_DIO_COUNT); + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); + } + + if (!dio->is_sync) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_mq_poll(bdev_get_queue(bdev), qc)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + + ret = dio->bio.bi_error; + if (likely(!ret)) + ret = dio->size; + + bio_put(&dio->bio); + return ret; +} + +static ssize_t +blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + int nr_pages; + + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); + if (!nr_pages) + return 0; + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + + return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); +} + +static __init int blkdev_init(void) +{ + blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio)); + if (!blkdev_dio_pool) + return -ENOMEM; + return 0; } +module_init(blkdev_init); int __sync_blockdev(struct block_device *bdev, int wait) { @@ -832,7 +1086,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, return true; /* already a holder */ else if (bdev->bd_holder != NULL) return false; /* held by someone else */ - else if (bdev->bd_contains == bdev) + else if (whole == bdev) return true; /* is a whole device which isn't held */ else if (whole->bd_holder == bd_may_claim) @@ -1950,6 +2204,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; + struct block_device *bdev; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || @@ -1970,8 +2225,12 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) */ iput(old_inode); old_inode = inode; + bdev = I_BDEV(inode); - func(I_BDEV(inode), arg); + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers) + func(bdev, arg); + mutex_unlock(&bdev->bd_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3a57f99d96aa..fe10afd51e02 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -930,7 +930,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (bio->bi_opf & REQ_SYNC) + if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -3485,9 +3485,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); else - ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); if (ret) errors++; } @@ -3551,7 +3551,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8ed05d95584a..1e67723c27a1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -127,7 +127,7 @@ struct extent_page_data { */ unsigned int extent_locked:1; - /* tells the submit_bio code to use a WRITE_SYNC */ + /* tells the submit_bio code to use REQ_SYNC */ unsigned int sync_io:1; }; @@ -2047,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { @@ -2388,7 +2388,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct bio *bio; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2404,9 +2404,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, } if (failed_bio->bi_vcnt > 1) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; phy_offset >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, @@ -3484,7 +3482,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; + write_flags = REQ_SYNC; trace___extent_writepage(page, inode, wbc); @@ -3729,7 +3727,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META; + int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -4076,7 +4074,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) int ret; bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? WRITE_SYNC : 0); + epd->sync_io ? REQ_SYNC : 0); ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e3a5a266917..a4c879671b9d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7917,7 +7917,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec; struct bio *bio; int isector; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7936,9 +7936,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, if ((failed_bio->bi_vcnt > 1) || (failed_bio->bi_io_vec->bv_len > BTRFS_I(inode)->root->sectorsize)) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; isector >>= inode->i_sb->s_blocksize_bits; @@ -8427,7 +8425,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8465,8 +8463,7 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), - bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fffb9ab8526e..ff3078234d94 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -4440,7 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 74ed5aae6cea..3b713b6fcc26 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -202,27 +202,25 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; - char lvl[4]; + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; - const char *type = logtypes[4]; int kern_level; - struct ratelimit_state *ratelimit; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; va_start(args, fmt); - kern_level = printk_get_level(fmt); - if (kern_level) { + while ((kern_level = printk_get_level(fmt)) != 0) { size_t size = printk_skip_level(fmt) - fmt; - memcpy(lvl, fmt, size); - lvl[size] = '\0'; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } fmt += size; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } else { - *lvl = '\0'; - /* Default to debug output */ - ratelimit = &printk_limits[7]; } vaf.fmt = fmt; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index bf62ad919a95..00ee006a8aa2 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -162,6 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) slot = radix_tree_iter_retry(&iter); continue; } + slot = radix_tree_iter_resume(slot, &iter); spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); spin_lock(&fs_info->buffer_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 71a60cc01451..0d7d635d8bfb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6023,7 +6023,7 @@ static void btrfs_end_bio(struct bio *bio) else btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_READ_ERRS); - if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH) + if (bio->bi_opf & REQ_PREFLUSH) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS); btrfs_dev_stat_print_on_error(dev); @@ -6100,7 +6100,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, bio->bi_next = NULL; spin_lock(&device->io_lock); - if (bio->bi_opf & REQ_SYNC) + if (op_is_sync(bio->bi_opf)) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 09ed29c67848..f137ffe6654c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -62,7 +62,7 @@ struct btrfs_device { int running_pending; /* regular prio bios */ struct btrfs_pending_bios pending_bios; - /* WRITE_SYNC bios */ + /* sync bios */ struct btrfs_pending_bios pending_sync_bios; struct block_device *bdev; diff --git a/fs/buffer.c b/fs/buffer.c index b205a629001d..d21771fcf7d3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -43,6 +43,7 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <linux/pagevec.h> #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -753,7 +754,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * still in flight on potentially older * contents. */ - write_dirty_buffer(bh, WRITE_SYNC); + write_dirty_buffer(bh, REQ_SYNC); /* * Kick off IO for the previous mapping. Note @@ -1604,37 +1605,80 @@ void create_empty_buffers(struct page *page, } EXPORT_SYMBOL(create_empty_buffers); -/* - * We are taking a block for data and we don't want any output from any - * buffer-cache aliases starting from return from that function and - * until the moment when something will explicitly mark the buffer - * dirty (hopefully that will not happen until we will free that block ;-) - * We don't even need to mark it not-uptodate - nobody can expect - * anything from a newly allocated buffer anyway. We used to used - * unmap_buffer() for such invalidation, but that was wrong. We definitely - * don't want to mark the alias unmapped, for example - it would confuse - * anyone who might pick it with bread() afterwards... - * - * Also.. Note that bforget() doesn't lock the buffer. So there can - * be writeout I/O going on against recently-freed buffers. We don't - * wait on that I/O in bforget() - it's more efficient to wait on the I/O - * only if we really need to. That happens here. - */ -void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +/** + * clean_bdev_aliases: clean a range of buffers in block device + * @bdev: Block device to clean buffers in + * @block: Start of a range of blocks to clean + * @len: Number of blocks to clean + * + * We are taking a range of blocks for data and we don't want writeback of any + * buffer-cache aliases starting from return from this function and until the + * moment when something will explicitly mark the buffer dirty (hopefully that + * will not happen until we will free that block ;-) We don't even need to mark + * it not-uptodate - nobody can expect anything from a newly allocated buffer + * anyway. We used to use unmap_buffer() for such invalidation, but that was + * wrong. We definitely don't want to mark the alias unmapped, for example - it + * would confuse anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can be + * writeout I/O going on against recently-freed buffers. We don't wait on that + * I/O in bforget() - it's more efficient to wait on the I/O only if we really + * need to. That happens here. + */ +void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { - struct buffer_head *old_bh; + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct pagevec pvec; + pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + pgoff_t end; + int i; + struct buffer_head *bh; + struct buffer_head *head; - might_sleep(); + end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); + pagevec_init(&pvec, 0); + while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - old_bh = __find_get_block_slow(bdev, block); - if (old_bh) { - clear_buffer_dirty(old_bh); - wait_on_buffer(old_bh); - clear_buffer_req(old_bh); - __brelse(old_bh); + index = page->index; + if (index > end) + break; + if (!page_has_buffers(page)) + continue; + /* + * We use page lock instead of bd_mapping->private_lock + * to pin buffers here since we can afford to sleep and + * it scales better than a global spinlock lock. + */ + lock_page(page); + /* Recheck when the page is locked which pins bhs */ + if (!page_has_buffers(page)) + goto unlock_page; + head = page_buffers(page); + bh = head; + do { + if (!buffer_mapped(bh)) + goto next; + if (bh->b_blocknr >= block + len) + break; + clear_buffer_dirty(bh); + wait_on_buffer(bh); + clear_buffer_req(bh); +next: + bh = bh->b_this_page; + } while (bh != head); +unlock_page: + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + index++; } } -EXPORT_SYMBOL(unmap_underlying_metadata); +EXPORT_SYMBOL(clean_bdev_aliases); /* * Size is a power-of-two in the range 512..PAGE_SIZE, @@ -1684,7 +1728,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * * prevents this contention from occurring. * * If block_write_full_page() is called with wbc->sync_mode == - * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ int __block_write_full_page(struct inode *inode, struct page *page, @@ -1697,7 +1741,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int write_flags = wbc_to_write_flags(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1745,8 +1789,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); } } bh = bh->b_this_page; @@ -1992,8 +2035,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, } if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -2633,7 +2675,7 @@ int nobh_write_begin(struct address_space *mapping, if (!buffer_mapped(bh)) is_mapped_to_disk = 0; if (buffer_new(bh)) - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { set_buffer_uptodate(bh); continue; @@ -3118,7 +3160,7 @@ EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) * @op: whether to %READ or %WRITE - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * @@ -3210,7 +3252,7 @@ EXPORT_SYMBOL(__sync_dirty_buffer); int sync_dirty_buffer(struct buffer_head *bh) { - return __sync_dirty_buffer(bh, WRITE_SYNC); + return __sync_dirty_buffer(bh, REQ_SYNC); } EXPORT_SYMBOL(sync_dirty_buffer); @@ -3403,7 +3445,7 @@ void free_buffer_head(struct buffer_head *bh) } EXPORT_SYMBOL(free_buffer_head); -static void buffer_exit_cpu(int cpu) +static int buffer_exit_cpu_dead(unsigned int cpu) { int i; struct bh_lru *b = &per_cpu(bh_lrus, cpu); @@ -3414,14 +3456,7 @@ static void buffer_exit_cpu(int cpu) } this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); per_cpu(bh_accounting, cpu).nr = 0; -} - -static int buffer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - buffer_exit_cpu((unsigned long)hcpu); - return NOTIFY_OK; + return 0; } /** @@ -3471,6 +3506,7 @@ EXPORT_SYMBOL(bh_submit_read); void __init buffer_init(void) { unsigned long nrpages; + int ret; bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, @@ -3483,5 +3519,7 @@ void __init buffer_init(void) */ nrpages = (nr_free_buffer_pages() * 10) / 100; max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); - hotcpu_notifier(buffer_cpu_notify, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead", + NULL, buffer_exit_cpu_dead); + WARN_ON(ret < 0); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c23eb0e9348c..d7a93696663b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1236,26 +1236,30 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; op = ceph_snap(dir) == CEPH_SNAPDIR ? - CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); if (!IS_ERR(req)) { req->r_dentry = dget(dentry); - req->r_num_caps = 2; + req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2; mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) mask |= CEPH_CAP_XATTR_SHARED; req->r_args.getattr.mask = mask; - req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); - if (err == 0 || err == -ENOENT) { - if (dentry == req->r_dentry) { - valid = !d_unhashed(dentry); - } else { - d_invalidate(req->r_dentry); - err = -EAGAIN; - } + switch (err) { + case 0: + if (d_really_is_positive(dentry) && + d_inode(dentry) == req->r_target_inode) + valid = 1; + break; + case -ENOENT: + if (d_really_is_negative(dentry)) + valid = 1; + /* Fallthrough */ + default: + break; } ceph_mdsc_put_request(req); dout("d_revalidate %p lookup result=%d\n", diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 8347c90cf483..5eb04129f938 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -808,7 +808,11 @@ calc_seckey(struct cifs_ses *ses) struct crypto_skcipher *tfm_arc4; struct scatterlist sgin, sgout; struct skcipher_request *req; - unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ + unsigned char *sec_key; + + sec_key = kmalloc(CIFS_SESS_KEY_SIZE, GFP_KERNEL); + if (sec_key == NULL) + return -ENOMEM; get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); @@ -816,7 +820,7 @@ calc_seckey(struct cifs_ses *ses) if (IS_ERR(tfm_arc4)) { rc = PTR_ERR(tfm_arc4); cifs_dbg(VFS, "could not allocate crypto API arc4\n"); - return rc; + goto out; } rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response, @@ -854,7 +858,8 @@ calc_seckey(struct cifs_ses *ses) out_free_cipher: crypto_free_skcipher(tfm_arc4); - +out: + kfree(sec_key); return rc; } diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3f3185febc58..e3fed9249a04 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3427,6 +3427,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, __u16 rc = 0; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; struct posix_acl_xattr_header *local_acl = (void *)pACL; + struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); int count; int i; @@ -3453,8 +3454,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, return 0; } for (i = 0; i < count; i++) { - rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], - (struct posix_acl_xattr_entry *)(local_acl + 1)); + rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]); if (rc != 0) { /* ACE not converted */ break; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index aab5227979e2..f7563c88c917 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -41,6 +41,7 @@ #include <keys/user-type.h> #include <net/ipv6.h> #include <linux/parser.h> +#include <linux/bvec.h> #include "cifspdu.h" #include "cifsglob.h" @@ -412,6 +413,9 @@ cifs_reconnect(struct TCP_Server_Info *server) } } while (server->tcpStatus == CifsNeedReconnect); + if (server->tcpStatus == CifsNeedNegotiate) + mod_delayed_work(cifsiod_wq, &server->echo, 0); + return rc; } @@ -421,17 +425,25 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); - unsigned long echo_interval = server->echo_interval; + unsigned long echo_interval; + + /* + * If we need to renegotiate, set echo interval to zero to + * immediately call echo service where we can renegotiate. + */ + if (server->tcpStatus == CifsNeedNegotiate) + echo_interval = 0; + else + echo_interval = server->echo_interval; /* - * We cannot send an echo if it is disabled or until the - * NEGOTIATE_PROTOCOL request is done, which is indicated by - * server->ops->need_neg() == true. Also, no need to ping if - * we got a response recently. + * We cannot send an echo if it is disabled. + * Also, no need to ping if we got a response recently. */ if (server->tcpStatus == CifsNeedReconnect || - server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew || + server->tcpStatus == CifsExiting || + server->tcpStatus == CifsNew || (server->ops->can_echo && !server->ops->can_echo(server)) || time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; @@ -442,7 +454,7 @@ cifs_echo_request(struct work_struct *work) server->hostname); requeue_echo: - queue_delayed_work(cifsiod_wq, &server->echo, echo_interval); + queue_delayed_work(cifsiod_wq, &server->echo, server->echo_interval); } static bool diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 206a597b2293..5f02edc819af 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -28,6 +28,7 @@ #include <linux/delay.h> #include <linux/freezer.h> #include <linux/tcp.h> +#include <linux/bvec.h> #include <linux/highmem.h> #include <asm/uaccess.h> #include <asm/processor.h> diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 92348faf9865..f514978f6688 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -8,9 +8,7 @@ config FS_ENCRYPTION select CRYPTO_XTS select CRYPTO_CTS select CRYPTO_CTR - select CRYPTO_SHA256 select KEYS - select ENCRYPTED_KEYS help Enable encryption of files and directories. This feature is similar to ecryptfs, but it is more memory diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 98f87fe8f186..ac8e4f6a3773 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -27,7 +27,7 @@ #include <linux/bio.h> #include <linux/dcache.h> #include <linux/namei.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -146,9 +146,10 @@ typedef enum { FS_ENCRYPT, } fscrypt_direction_t; -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, +static int do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, gfp_t gfp_flags) { struct { @@ -162,6 +163,8 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -175,14 +178,14 @@ static int do_page_crypto(struct inode *inode, page_crypt_complete, &ecr); BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); + xts_tweak.index = cpu_to_le64(lblk_num); memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -207,34 +210,66 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + len, offs, gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) @@ -245,10 +280,10 @@ struct page *fscrypt_encrypt_page(struct inode *inode, if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + len, offs, gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +300,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,16 +314,18 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len, + offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { struct fscrypt_ctx *ctx; @@ -306,7 +348,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, while (len--) { err = do_page_crypto(inode, FS_ENCRYPT, lblk, ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); + PAGE_SIZE, 0, GFP_NOFS); if (err) goto errout; @@ -414,7 +456,8 @@ static void completion_pages(struct work_struct *work) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); if (ret) { WARN_ON_ONCE(1); @@ -482,17 +525,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -521,7 +569,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..56ad9d195f18 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -209,7 +209,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,7 +227,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_crypt_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..aeab032d7d35 --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,93 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include <linux/fscrypto.h> + +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. + */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + + +/* crypto.c */ +int fscrypt_initialize(unsigned int cop_flags); + +/* keyinfo.c */ +extern int fscrypt_get_crypt_info(struct inode *); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..6eeea1dcba41 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,7 @@ #include <keys/user-type.h> #include <linux/scatterlist.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -178,7 +178,7 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +int fscrypt_get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,7 +188,7 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; @@ -327,7 +327,7 @@ int fscrypt_get_encryption_info(struct inode *inode) (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); + return fscrypt_get_crypt_info(inode); return 0; } EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6865663aac69..6ed7c2eebeec 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,8 +10,8 @@ #include <linux/random.h> #include <linux/string.h> -#include <linux/fscrypto.h> #include <linux/mount.h> +#include "fscrypt_private.h" static int inode_has_encryption_context(struct inode *inode) { @@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; + if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp, ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); + &policy); } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { + &policy)) { printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", __func__); @@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; if (!inode->i_sb->s_cop->get_context || @@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { @@ -31,28 +31,15 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/mmu_notifier.h> #include <linux/iomap.h> #include "internal.h" -/* - * We use lowest available bit in exceptional entry for locking, other two - * bits to determine entry type. In total 3 special bits. - */ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) -#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ - RADIX_TREE_EXCEPTIONAL_ENTRY)) - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) -wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; +static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) { @@ -64,14 +51,6 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, - pgoff_t index) -{ - unsigned long hash = hash_long((unsigned long)mapping ^ index, - DAX_WAIT_TABLE_BITS); - return wait_table + hash; -} - static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; @@ -98,209 +77,52 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } -struct page *read_dax_sector(struct block_device *bdev, sector_t n) +static int dax_is_pmd_entry(void *entry) { - struct page *page = alloc_pages(GFP_KERNEL, 0); - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), - }; - long rc; - - if (!page) - return ERR_PTR(-ENOMEM); - - rc = dax_map_atomic(bdev, &dax); - if (rc < 0) - return ERR_PTR(rc); - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); - dax_unmap_atomic(bdev, &dax); - return page; + return (unsigned long)entry & RADIX_DAX_PMD; } -static bool buffer_written(struct buffer_head *bh) +static int dax_is_pte_entry(void *entry) { - return buffer_mapped(bh) && !buffer_unwritten(bh); + return !((unsigned long)entry & RADIX_DAX_PMD); } -/* - * When ext4 encounters a hole, it returns without modifying the buffer_head - * which means that we can't trust b_size. To cope with this, we set b_state - * to 0 before calling get_block and, if any bit is set, we know we can trust - * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is - * and would save us time calling get_block repeatedly. - */ -static bool buffer_size_valid(struct buffer_head *bh) +static int dax_is_zero_entry(void *entry) { - return bh->b_state != 0; + return (unsigned long)entry & RADIX_DAX_HZP; } - -static sector_t to_sector(const struct buffer_head *bh, - const struct inode *inode) +static int dax_is_empty_entry(void *entry) { - sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); - - return sector; + return (unsigned long)entry & RADIX_DAX_EMPTY; } -static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, - loff_t start, loff_t end, get_block_t get_block, - struct buffer_head *bh) +struct page *read_dax_sector(struct block_device *bdev, sector_t n) { - loff_t pos = start, max = start, bh_max = start; - bool hole = false; - struct block_device *bdev = NULL; - int rw = iov_iter_rw(iter), rc; - long map_len = 0; + struct page *page = alloc_pages(GFP_KERNEL, 0); struct blk_dax_ctl dax = { - .addr = ERR_PTR(-EIO), + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), }; - unsigned blkbits = inode->i_blkbits; - sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) - >> blkbits; - - if (rw == READ) - end = min(end, i_size_read(inode)); - - while (pos < end) { - size_t len; - if (pos == max) { - long page = pos >> PAGE_SHIFT; - sector_t block = page << (PAGE_SHIFT - blkbits); - unsigned first = pos - (block << blkbits); - long size; - - if (pos == bh_max) { - bh->b_size = PAGE_ALIGN(end - pos); - bh->b_state = 0; - rc = get_block(inode, block, bh, rw == WRITE); - if (rc) - break; - if (!buffer_size_valid(bh)) - bh->b_size = 1 << blkbits; - bh_max = pos - first + bh->b_size; - bdev = bh->b_bdev; - /* - * We allow uninitialized buffers for writes - * beyond EOF as those cannot race with faults - */ - WARN_ON_ONCE( - (buffer_new(bh) && block < file_blks) || - (rw == WRITE && buffer_unwritten(bh))); - } else { - unsigned done = bh->b_size - - (bh_max - (pos - first)); - bh->b_blocknr += done >> blkbits; - bh->b_size -= done; - } - - hole = rw == READ && !buffer_written(bh); - if (hole) { - size = bh->b_size - first; - } else { - dax_unmap_atomic(bdev, &dax); - dax.sector = to_sector(bh, inode); - dax.size = bh->b_size; - map_len = dax_map_atomic(bdev, &dax); - if (map_len < 0) { - rc = map_len; - break; - } - dax.addr += first; - size = map_len - first; - } - /* - * pos + size is one past the last offset for IO, - * so pos + size can overflow loff_t at extreme offsets. - * Cast to u64 to catch this and get the true minimum. - */ - max = min_t(u64, pos + size, end); - } - - if (iov_iter_rw(iter) == WRITE) { - len = copy_from_iter_pmem(dax.addr, max - pos, iter); - } else if (!hole) - len = copy_to_iter((void __force *) dax.addr, max - pos, - iter); - else - len = iov_iter_zero(max - pos, iter); - - if (!len) { - rc = -EFAULT; - break; - } + long rc; - pos += len; - if (!IS_ERR(dax.addr)) - dax.addr += len; - } + if (!page) + return ERR_PTR(-ENOMEM); + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); dax_unmap_atomic(bdev, &dax); - - return (pos == start) ? rc : pos - start; -} - -/** - * dax_do_io - Perform I/O to a DAX file - * @iocb: The control block for this I/O - * @inode: The file which the I/O is directed at - * @iter: The addresses to do I/O from or to - * @get_block: The filesystem method used to translate file offsets to blocks - * @end_io: A filesystem callback for I/O completion - * @flags: See below - * - * This function uses the same locking scheme as do_blockdev_direct_IO: - * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the - * caller for writes. For reads, we take and release the i_mutex ourselves. - * If DIO_LOCKING is not set, the filesystem takes care of its own locking. - * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O - * is in progress. - */ -ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, - struct iov_iter *iter, get_block_t get_block, - dio_iodone_t end_io, int flags) -{ - struct buffer_head bh; - ssize_t retval = -EINVAL; - loff_t pos = iocb->ki_pos; - loff_t end = pos + iov_iter_count(iter); - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - inode_lock(inode); - - /* Protects against truncate */ - if (!(flags & DIO_SKIP_DIO_COUNT)) - inode_dio_begin(inode); - - retval = dax_io(inode, iter, pos, end, get_block, &bh); - - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - inode_unlock(inode); - - if (end_io) { - int err; - - err = end_io(iocb, pos, retval, bh.b_private); - if (err) - retval = err; - } - - if (!(flags & DIO_SKIP_DIO_COUNT)) - inode_dio_end(inode); - return retval; + return page; } -EXPORT_SYMBOL_GPL(dax_do_io); /* * DAX radix tree locking */ struct exceptional_entry_key { struct address_space *mapping; - unsigned long index; + pgoff_t entry_start; }; struct wait_exceptional_entry_queue { @@ -308,6 +130,26 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index, void *entry, struct exceptional_entry_key *key) +{ + unsigned long hash; + + /* + * If 'entry' is a PMD, align the 'index' that we use for the wait + * queue to the start of that PMD. This ensures that all offsets in + * the range covered by the PMD map to the same bit lock. + */ + if (dax_is_pmd_entry(entry)) + index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); + + key->mapping = mapping; + key->entry_start = index; + + hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} + static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, int sync, void *keyp) { @@ -316,7 +158,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, container_of(wait, struct wait_exceptional_entry_queue, wait); if (key->mapping != ewait->key.mapping || - key->index != ewait->key.index) + key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); } @@ -342,7 +184,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -356,7 +198,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -372,24 +214,24 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) { - void *ret, **slot; + void *entry, **slot; struct wait_exceptional_entry_queue ewait; - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; - ewait.key.mapping = mapping; - ewait.key.index = index; for (;;) { - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!ret || !radix_tree_exceptional_entry(ret) || + if (!entry || !radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; - return ret; + return entry; } + + wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&mapping->tree_lock); @@ -399,52 +241,173 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, } } +static void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + void *entry, **slot; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + dax_unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. + */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + /* * Find radix tree entry at given index. If it points to a page, return with * the page locked. If it points to the exceptional entry, return with the * radix tree entry locked. If the radix tree doesn't contain given index, * create empty exceptional entry for the index and return with it locked. * + * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will + * either return that locked entry or will return an error. This error will + * happen if there are any 4k entries (either zero pages or DAX entries) + * within the 2MiB range that we are requesting. + * + * We always favor 4k entries over 2MiB entries. There isn't a flow where we + * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB + * insertion will fail if it finds any 4k entries already in the tree, and a + * 4k insertion will cause an existing 2MiB entry to be unmapped and + * downgraded to 4k entries. This happens for both 2MiB huge zero pages as + * well as 2MiB empty entries. + * + * The exception to this downgrade path is for 2MiB DAX PMD entries that have + * real storage backing them. We will leave these real 2MiB DAX entries in + * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. + * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. */ -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, + unsigned long size_flag) { - void *ret, **slot; + bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ + void *entry, **slot; restart: spin_lock_irq(&mapping->tree_lock); - ret = get_unlocked_mapping_entry(mapping, index, &slot); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + + if (entry) { + if (size_flag & RADIX_DAX_PMD) { + if (!radix_tree_exceptional_entry(entry) || + dax_is_pte_entry(entry)) { + put_unlocked_mapping_entry(mapping, index, + entry); + entry = ERR_PTR(-EEXIST); + goto out_unlock; + } + } else { /* trying to grab a PTE entry */ + if (radix_tree_exceptional_entry(entry) && + dax_is_pmd_entry(entry) && + (dax_is_zero_entry(entry) || + dax_is_empty_entry(entry))) { + pmd_downgrade = true; + } + } + } + /* No entry for given index? Make sure radix tree is big enough. */ - if (!ret) { + if (!entry || pmd_downgrade) { int err; + if (pmd_downgrade) { + /* + * Make sure 'entry' remains valid while we drop + * mapping->tree_lock. + */ + entry = lock_slot(mapping, slot); + } + spin_unlock_irq(&mapping->tree_lock); + /* + * Besides huge zero pages the only other thing that gets + * downgraded are empty entries which don't need to be + * unmapped. + */ + if (pmd_downgrade && dax_is_zero_entry(entry)) + unmap_mapping_range(mapping, + (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) + if (err) { + if (pmd_downgrade) + put_locked_mapping_entry(mapping, index, entry); return ERR_PTR(err); - ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK); + } spin_lock_irq(&mapping->tree_lock); - err = radix_tree_insert(&mapping->page_tree, index, ret); + + if (pmd_downgrade) { + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + dax_wake_mapping_entry_waiter(mapping, index, entry, + true); + } + + entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); + + err = __radix_tree_insert(&mapping->page_tree, index, + dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { spin_unlock_irq(&mapping->tree_lock); - /* Someone already created the entry? */ - if (err == -EEXIST) + /* + * Someone already created the entry? This is a + * normal failure when inserting PMDs in a range + * that already contains PTEs. In that case we want + * to return -EEXIST immediately. + */ + if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD)) goto restart; + /* + * Our insertion of a DAX PMD entry failed, most + * likely because it collided with a PTE sized entry + * at a different index in the PMD range. We haven't + * inserted anything into the radix tree and have no + * waiters to wake. + */ return ERR_PTR(err); } /* Good, we have inserted empty locked entry into the tree. */ mapping->nrexceptional++; spin_unlock_irq(&mapping->tree_lock); - return ret; + return entry; } /* Normal page in radix tree? */ - if (!radix_tree_exceptional_entry(ret)) { - struct page *page = ret; + if (!radix_tree_exceptional_entry(entry)) { + struct page *page = entry; get_page(page); spin_unlock_irq(&mapping->tree_lock); @@ -457,15 +420,26 @@ restart: } return page; } - ret = lock_slot(mapping, slot); + entry = lock_slot(mapping, slot); + out_unlock: spin_unlock_irq(&mapping->tree_lock); - return ret; + return entry; } +/* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree. This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, bool wake_all) + pgoff_t index, void *entry, bool wake_all) { - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + struct exceptional_entry_key key; + wait_queue_head_t *wq; + + wq = dax_entry_waitqueue(mapping, index, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -473,54 +447,8 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ - if (waitqueue_active(wq)) { - struct exceptional_entry_key key; - - key.mapping = mapping; - key.index = index; + if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); - } -} - -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *ret, **slot; - - spin_lock_irq(&mapping->tree_lock); - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || - !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); - return; - } - unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, false); -} - -static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } -} - -/* - * Called when we are done with radix tree entry we looked up via - * get_unlocked_mapping_entry() and which we didn't lock in the end. - */ -static void put_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!radix_tree_exceptional_entry(entry)) - return; - - /* We have to wake up next waiter for the radix tree entry lock */ - dax_wake_mapping_entry_waiter(mapping, index, false); } /* @@ -547,7 +475,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, true); + dax_wake_mapping_entry_waiter(mapping, index, entry, true); return 1; } @@ -574,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry, /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) { - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (!page) return VM_FAULT_OOM; - } vmf->page = page; return VM_FAULT_LOCKED; } @@ -600,11 +526,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size return 0; } -#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) - +/* + * By this point grab_mapping_entry() has ensured that we have a locked entry + * of the appropriate size so we don't have to worry about downgrading PMDs to + * PTEs. If we happen to be trying to insert a PTE and there is a PMD + * already in the tree, we will skip the insertion and just dirty the PMD as + * appropriate. + */ static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, - void *entry, sector_t sector) + void *entry, sector_t sector, + unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; int error = 0; @@ -627,28 +559,43 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); if (error) return ERR_PTR(error); + } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { + /* replacing huge zero page with PMD block mapping */ + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); - new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | - RADIX_DAX_ENTRY_LOCK); + new_entry = dax_radix_locked_entry(sector, flags); + if (hole_fill) { __delete_from_page_cache(entry, NULL); /* Drop pagecache reference */ put_page(entry); - error = radix_tree_insert(page_tree, index, new_entry); + error = __radix_tree_insert(page_tree, index, + dax_radix_order(new_entry), new_entry); if (error) { new_entry = ERR_PTR(error); goto unlock; } mapping->nrexceptional++; - } else { + } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + /* + * Only swap our new entry into the radix tree if the current + * entry is a zero page or an empty entry. If a normal PTE or + * PMD entry is already in the tree, we leave it alone. This + * means that if we are trying to insert a PTE and the + * existing entry is a PMD, we will just leave the PMD in the + * tree and dirty it if necessary. + */ + struct radix_tree_node *node; void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - radix_tree_replace_slot(slot, new_entry); + __radix_tree_replace(page_tree, node, slot, + new_entry, NULL, NULL); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); @@ -668,63 +615,150 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, return new_entry; } +static inline unsigned long +pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +{ + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + return address; +} + +/* Walk all mappings of a given index of a file and writeprotect them */ +static void dax_mapping_entry_mkclean(struct address_space *mapping, + pgoff_t index, unsigned long pfn) +{ + struct vm_area_struct *vma; + pte_t *ptep; + pte_t pte; + spinlock_t *ptl; + bool changed; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { + unsigned long address; + + cond_resched(); + + if (!(vma->vm_flags & VM_SHARED)) + continue; + + address = pgoff_address(index, vma); + changed = false; + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + continue; + if (pfn != pte_pfn(*ptep)) + goto unlock; + if (!pte_dirty(*ptep) && !pte_write(*ptep)) + goto unlock; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock: + pte_unmap_unlock(ptep, ptl); + + if (changed) + mmu_notifier_invalidate_page(vma->vm_mm, address); + } + i_mmap_unlock_read(mapping); +} + static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - int type = RADIX_DAX_TYPE(entry); - struct radix_tree_node *node; struct blk_dax_ctl dax; - void **slot; + void *entry2, **slot; int ret = 0; - spin_lock_irq(&mapping->tree_lock); /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. + * A page got tagged dirty in DAX mapping? Something is seriously + * wrong. */ - if (!__radix_tree_lookup(page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - - /* another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto unlock; + if (WARN_ON(!radix_tree_exceptional_entry(entry))) + return -EIO; - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + spin_lock_irq(&mapping->tree_lock); + entry2 = get_unlocked_mapping_entry(mapping, index, &slot); + /* Entry got punched out / reallocated? */ + if (!entry2 || !radix_tree_exceptional_entry(entry2)) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. We have to + * compare sectors as we must not bail out due to difference in lockbit + * or entry type. + */ + if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + goto put_unlocked; + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || + dax_is_zero_entry(entry))) { ret = -EIO; - goto unlock; + goto put_unlocked; } - dax.sector = RADIX_DAX_SECTOR(entry); - dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + /* Another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; + /* Lock the entry to serialize with page faults */ + entry = lock_slot(mapping, slot); + /* + * We can clear the tag now but we have to be careful so that concurrent + * dax_writeback_one() calls for the same index cannot finish before we + * actually flush the caches. This is achieved as the calls will look + * at the entry only under tree_lock and once they do that they will + * see the entry locked and wait for it to unlock. + */ + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); /* + * Even if dax_writeback_mapping_range() was given a wbc->range_start + * in the middle of a PMD, the 'index' we are given will be aligned to + * the start index of the PMD, as will the sector we pull from + * 'entry'. This allows us to flush for PMD_SIZE and not have to + * worry about partial PMD writebacks. + */ + dax.sector = dax_radix_sector(entry); + dax.size = PAGE_SIZE << dax_radix_order(entry); + + /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). */ ret = dax_map_atomic(bdev, &dax); - if (ret < 0) + if (ret < 0) { + put_locked_mapping_entry(mapping, index, entry); return ret; + } if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); wb_cache_pmem(dax.addr, dax.size); - + /* + * After we have flushed the cache, we can clear the dirty tag. There + * cannot be new dirty data in the pfn after the flush has completed as + * the pfn mappings are writeprotected and fault waits for mapping + * entry lock. + */ spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); + put_locked_mapping_entry(mapping, index, entry); return ret; - unlock: + put_unlocked: + put_unlocked_mapping_entry(mapping, index, entry2); spin_unlock_irq(&mapping->tree_lock); return ret; } @@ -738,12 +772,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; - pgoff_t start_index, end_index, pmd_index; + pgoff_t start_index, end_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; - void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; @@ -753,15 +786,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; - pmd_index = DAX_PMD_INDEX(start_index); - - rcu_read_lock(); - entry = radix_tree_lookup(&mapping->page_tree, pmd_index); - rcu_read_unlock(); - - /* see if the start of our range is covered by a PMD entry */ - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) - start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); @@ -794,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct blk_dax_ctl dax = { .sector = sector, .size = size, @@ -806,7 +830,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); *entryp = ret; @@ -815,323 +839,6 @@ static int dax_insert_mapping(struct address_space *mapping, } /** - * dax_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * fault handler for DAX files. dax_fault() assumes the caller has done all - * the necessary locking for the page fault to proceed successfully. - */ -int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - void *entry; - struct buffer_head bh; - unsigned long vaddr = (unsigned long)vmf->virtual_address; - unsigned blkbits = inode->i_blkbits; - sector_t block; - pgoff_t size; - int error; - int major = 0; - - /* - * Check whether offset isn't beyond end of file now. Caller is supposed - * to hold locks serializing us with truncate / punch hole so this is - * a reliable test. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - memset(&bh, 0, sizeof(bh)); - block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); - bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_SIZE; - - entry = grab_mapping_entry(mapping, vmf->pgoff); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto out; - } - - error = get_block(inode, block, &bh, 0); - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; /* fs corruption? */ - if (error) - goto unlock_entry; - - if (vmf->cow_page) { - struct page *new_page = vmf->cow_page; - if (buffer_written(&bh)) - error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), - bh.b_size, new_page, vaddr); - else - clear_user_highpage(new_page, vaddr); - if (error) - goto unlock_entry; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; - } - vmf->entry = entry; - return VM_FAULT_DAX_LOCKED; - } - - if (!buffer_mapped(&bh)) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_entry; - } else { - return dax_load_hole(mapping, entry, vmf); - } - } - - /* Filesystem should not return unwritten buffers to us! */ - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), - bh.b_size, &entry, vma, vmf); - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); - out: - if (error == -ENOMEM) - return VM_FAULT_OOM | major; - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if ((error < 0) && (error != -EBUSY)) - return VM_FAULT_SIGBUS | major; - return VM_FAULT_NOPAGE | major; -} -EXPORT_SYMBOL_GPL(dax_fault); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below function. - */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - -static void __dax_dbg(struct buffer_head *bh, unsigned long address, - const char *reason, const char *fn) -{ - if (bh) { - char bname[BDEVNAME_SIZE]; - bdevname(bh->b_bdev, bname); - pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " - "length %zd fallback: %s\n", fn, current->comm, - address, bname, bh->b_state, (u64)bh->b_blocknr, - bh->b_size, reason); - } else { - pr_debug("%s: %s addr: %lx fallback: %s\n", fn, - current->comm, address, reason); - } -} - -#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") - -/** - * dax_pmd_fault - handle a PMD fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * pmd_fault handler for DAX files. - */ -int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct buffer_head bh; - unsigned blkbits = inode->i_blkbits; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; - struct block_device *bdev; - pgoff_t size, pgoff; - sector_t block; - int result = 0; - bool alloc = false; - - /* dax pmd mappings require pfn_t_devmap() */ - if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) - return VM_FAULT_FALLBACK; - - /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) { - split_huge_pmd(vma, pmd, address); - dax_pmd_dbg(NULL, address, "cow write"); - return VM_FAULT_FALLBACK; - } - /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) { - dax_pmd_dbg(NULL, address, "vma start unaligned"); - return VM_FAULT_FALLBACK; - } - if ((pmd_addr + PMD_SIZE) > vma->vm_end) { - dax_pmd_dbg(NULL, address, "vma end unaligned"); - return VM_FAULT_FALLBACK; - } - - pgoff = linear_page_index(vma, pmd_addr); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) - return VM_FAULT_SIGBUS; - /* If the PMD would cover blocks out of the file */ - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(NULL, address, - "offset + huge page size > file size"); - return VM_FAULT_FALLBACK; - } - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); - - bh.b_size = PMD_SIZE; - - if (get_block(inode, block, &bh, 0) != 0) - return VM_FAULT_SIGBUS; - - if (!buffer_mapped(&bh) && write) { - if (get_block(inode, block, &bh, 1) != 0) - return VM_FAULT_SIGBUS; - alloc = true; - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - } - - bdev = bh.b_bdev; - - /* - * If the filesystem isn't willing to tell us the length of a hole, - * just fall back to PTEs. Calling get_block 512 times in a loop - * would be silly. - */ - if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "allocated block too small"); - return VM_FAULT_FALLBACK; - } - - /* - * If we allocated new storage, make sure no process has any - * zero pages covering this hole - */ - if (alloc) { - loff_t lstart = pgoff << PAGE_SHIFT; - loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ - - truncate_pagecache_range(inode, lstart, lend); - } - - if (!write && !buffer_mapped(&bh)) { - spinlock_t *ptl; - pmd_t entry; - struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); - - if (unlikely(!zero_page)) { - dax_pmd_dbg(&bh, address, "no zero page"); - goto fallback; - } - - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_none(*pmd)) { - spin_unlock(ptl); - dax_pmd_dbg(&bh, address, "pmd already present"); - goto fallback; - } - - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: <zero> sect: %llx\n", - __func__, current->comm, address, - (unsigned long long) to_sector(&bh, inode)); - - entry = mk_pmd(zero_page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); - set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); - result = VM_FAULT_NOPAGE; - spin_unlock(ptl); - } else { - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); - - if (length < 0) { - dax_pmd_dbg(&bh, address, "dax-error fallback"); - goto fallback; - } - if (length < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "dax-length too small"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { - dax_pmd_dbg(&bh, address, "pfn unaligned"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - - if (!pfn_t_devmap(dax.pfn)) { - dax_unmap_atomic(bdev, &dax); - dax_pmd_dbg(&bh, address, "pfn not in memmap"); - goto fallback; - } - dax_unmap_atomic(bdev, &dax); - - /* - * For PTE faults we insert a radix tree entry for reads, and - * leave it clean. Then on the first write we dirty the radix - * tree entry via the dax_pfn_mkwrite() path. This sequence - * allows the dax_pfn_mkwrite() call to be simpler and avoid a - * call into get_block() to translate the pgoff to a sector in - * order to be able to create a new radix tree entry. - * - * The PMD path doesn't have an equivalent to - * dax_pfn_mkwrite(), though, so for a read followed by a - * write we traverse all the way through dax_pmd_fault() - * twice. This means we can just skip inserting a radix tree - * entry completely on the initial read and just wait until - * the write to insert a dirty entry. - */ - if (write) { - /* - * We should insert radix-tree entry and dirty it here. - * For now this is broken... - */ - } - - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: %lx sect: %llx\n", - __func__, current->comm, address, - pfn_t_to_pfn(dax.pfn), - (unsigned long long) dax.sector); - result |= vmf_insert_pfn_pmd(vma, address, pmd, - dax.pfn, write); - } - - out: - return result; - - fallback: - count_vm_event(THP_FAULT_FALLBACK); - result = VM_FAULT_FALLBACK; - goto out; -} -EXPORT_SYMBOL_GPL(dax_pmd_fault); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/** * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault @@ -1140,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - void *entry; + void *entry, **slot; pgoff_t index = vmf->pgoff; spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) - goto out; + entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (!entry || !radix_tree_exceptional_entry(entry)) { + if (entry) + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + return VM_FAULT_NOPAGE; + } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - put_unlocked_mapping_entry(mapping, index, entry); -out: + entry = lock_slot(mapping, slot); spin_unlock_irq(&mapping->tree_lock); + /* + * If we race with somebody updating the PTE and finish_mkwrite_fault() + * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry + * the fault in either case. + */ + finish_mkwrite_fault(vmf); + put_locked_mapping_entry(mapping, index, entry); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -1191,62 +908,14 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -/** - * dax_zero_page_range - zero a range within a page of a DAX file - * @inode: The file being truncated - * @from: The file offset that is being truncated to - * @length: The number of bytes to zero - * @get_block: The filesystem method used to translate file offsets to blocks - * - * This function can be called by a filesystem when it is zeroing part of a - * page in a DAX file. This is intended for hole-punch operations. If - * you are truncating a file, the helper function dax_truncate_page() may be - * more convenient. - */ -int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, - get_block_t get_block) -{ - struct buffer_head bh; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - int err; - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - BUG_ON((offset + length) > PAGE_SIZE); - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_SIZE; - err = get_block(inode, index, &bh, 0); - if (err < 0 || !buffer_written(&bh)) - return err; - - return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), - offset, length); -} -EXPORT_SYMBOL_GPL(dax_zero_page_range); - -/** - * dax_truncate_page - handle a partial page being truncated in a DAX file - * @inode: The file being truncated - * @from: The file offset that is being truncated to - * @get_block: The filesystem method used to translate file offsets to blocks - * - * Similar to block_truncate_page(), this function can be called by a - * filesystem when it is truncating a DAX file to handle the partial page. - */ -int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +#ifdef CONFIG_FS_IOMAP +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - unsigned length = PAGE_ALIGN(from) - from; - return dax_zero_page_range(inode, from, length, get_block); + return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); } -EXPORT_SYMBOL_GPL(dax_truncate_page); -#ifdef CONFIG_FS_IOMAP static loff_t -iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, +dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { struct iov_iter *iter = data; @@ -1270,8 +939,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct blk_dax_ctl dax = { 0 }; ssize_t map_len; - dax.sector = iomap->blkno + - (((pos & PAGE_MASK) - iomap->offset) >> 9); + dax.sector = dax_iomap_sector(iomap, pos); dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; map_len = dax_map_atomic(iomap->bdev, &dax); if (map_len < 0) { @@ -1303,7 +971,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } /** - * iomap_dax_rw - Perform I/O to a DAX file + * dax_iomap_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system @@ -1313,7 +981,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * and evicting any page cache pages in the region under I/O. */ ssize_t -iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, +dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops) { struct address_space *mapping = iocb->ki_filp->f_mapping; @@ -1343,7 +1011,7 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, while (iov_iter_count(iter)) { ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, - iter, iomap_dax_actor); + iter, dax_iomap_actor); if (ret <= 0) break; pos += ret; @@ -1353,10 +1021,10 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, iocb->ki_pos += done; return done ? done : ret; } -EXPORT_SYMBOL_GPL(iomap_dax_rw); +EXPORT_SYMBOL_GPL(dax_iomap_rw); /** - * iomap_dax_fault - handle a page fault on a DAX file + * dax_iomap_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system @@ -1365,17 +1033,18 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw); * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. */ -int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, +int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; - unsigned flags = 0; + unsigned flags = IOMAP_FAULT; int error, major = 0; + int vmf_ret = 0; void *entry; /* @@ -1386,7 +1055,7 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (pos >= i_size_read(inode)) return VM_FAULT_SIGBUS; - entry = grab_mapping_entry(mapping, vmf->pgoff); + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto out; @@ -1405,10 +1074,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, goto unlock_entry; if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { error = -EIO; /* fs corruption? */ - goto unlock_entry; + goto finish_iomap; } - sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); + sector = dax_iomap_sector(&iomap, pos); if (vmf->cow_page) { switch (iomap.type) { @@ -1427,13 +1096,13 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } if (error) - goto unlock_entry; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; - } - vmf->entry = entry; - return VM_FAULT_DAX_LOCKED; + goto finish_iomap; + + __SetPageUptodate(vmf->cow_page); + vmf_ret = finish_fault(vmf); + if (!vmf_ret) + vmf_ret = VM_FAULT_DONE_COW; + goto finish_iomap; } switch (iomap.type) { @@ -1448,8 +1117,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) - return dax_load_hole(mapping, entry, vmf); + if (!(vmf->flags & FAULT_FLAG_WRITE)) { + vmf_ret = dax_load_hole(mapping, entry, vmf); + break; + } /*FALLTHRU*/ default: WARN_ON_ONCE(1); @@ -1457,15 +1128,218 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, break; } + finish_iomap: + if (ops->iomap_end) { + if (error || (vmf_ret & VM_FAULT_ERROR)) { + /* keep previous error */ + ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, + &iomap); + } else { + error = ops->iomap_end(inode, pos, PAGE_SIZE, + PAGE_SIZE, flags, &iomap); + } + } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (vmf_ret != VM_FAULT_LOCKED || error) + put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error < 0 && error != -EBUSY) return VM_FAULT_SIGBUS | major; + if (vmf_ret) { + WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ + return vmf_ret; + } return VM_FAULT_NOPAGE | major; } -EXPORT_SYMBOL_GPL(iomap_dax_fault); +EXPORT_SYMBOL_GPL(dax_iomap_fault); + +#ifdef CONFIG_FS_DAX_PMD +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + +static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, + struct vm_fault *vmf, unsigned long address, + struct iomap *iomap, loff_t pos, bool write, void **entryp) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct block_device *bdev = iomap->bdev; + struct blk_dax_ctl dax = { + .sector = dax_iomap_sector(iomap, pos), + .size = PMD_SIZE, + }; + long length = dax_map_atomic(bdev, &dax); + void *ret; + + if (length < 0) /* dax_map_atomic() failed */ + return VM_FAULT_FALLBACK; + if (length < PMD_SIZE) + goto unmap_fallback; + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) + goto unmap_fallback; + if (!pfn_t_devmap(dax.pfn)) + goto unmap_fallback; + + dax_unmap_atomic(bdev, &dax); + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, + RADIX_DAX_PMD); + if (IS_ERR(ret)) + return VM_FAULT_FALLBACK; + *entryp = ret; + + return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + + unmap_fallback: + dax_unmap_atomic(bdev, &dax); + return VM_FAULT_FALLBACK; +} + +static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, + struct vm_fault *vmf, unsigned long address, + struct iomap *iomap, void **entryp) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long pmd_addr = address & PMD_MASK; + struct page *zero_page; + spinlock_t *ptl; + pmd_t pmd_entry; + void *ret; + + zero_page = mm_get_huge_zero_page(vma->vm_mm); + + if (unlikely(!zero_page)) + return VM_FAULT_FALLBACK; + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, + RADIX_DAX_PMD | RADIX_DAX_HZP); + if (IS_ERR(ret)) + return VM_FAULT_FALLBACK; + *entryp = ret; + + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_none(*pmd)) { + spin_unlock(ptl); + return VM_FAULT_FALLBACK; + } + + pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); + pmd_entry = pmd_mkhuge(pmd_entry); + set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); + spin_unlock(ptl); + return VM_FAULT_NOPAGE; +} + +int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long pmd_addr = address & PMD_MASK; + bool write = flags & FAULT_FLAG_WRITE; + unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; + struct inode *inode = mapping->host; + int result = VM_FAULT_FALLBACK; + struct iomap iomap = { 0 }; + pgoff_t max_pgoff, pgoff; + struct vm_fault vmf; + void *entry; + loff_t pos; + int error; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vma->vm_flags & VM_SHARED)) + goto fallback; + + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vma->vm_start) + goto fallback; + if ((pmd_addr + PMD_SIZE) > vma->vm_end) + goto fallback; + + /* + * Check whether offset isn't beyond end of file now. Caller is + * supposed to hold locks serializing us with truncate / punch hole so + * this is a reliable test. + */ + pgoff = linear_page_index(vma, pmd_addr); + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + if (pgoff > max_pgoff) + return VM_FAULT_SIGBUS; + + /* If the PMD would extend beyond the file size */ + if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + goto fallback; + + /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto fallback; + + /* + * Note that we don't use iomap_apply here. We aren't doing I/O, only + * setting up a mapping, so really we're using iomap_begin() as a way + * to look up our filesystem block. + */ + pos = (loff_t)pgoff << PAGE_SHIFT; + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); + if (error) + goto unlock_entry; + if (iomap.offset + iomap.length < pos + PMD_SIZE) + goto finish_iomap; + + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; + + switch (iomap.type) { + case IOMAP_MAPPED: + result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, + &iomap, pos, write, &entry); + break; + case IOMAP_UNWRITTEN: + case IOMAP_HOLE: + if (WARN_ON_ONCE(write)) + goto finish_iomap; + result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, + &entry); + break; + default: + WARN_ON_ONCE(1); + break; + } + + finish_iomap: + if (ops->iomap_end) { + if (result == VM_FAULT_FALLBACK) { + ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags, + &iomap); + } else { + error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE, + iomap_flags, &iomap); + if (error) + result = VM_FAULT_FALLBACK; + } + } + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); + fallback: + if (result == VM_FAULT_FALLBACK) { + split_huge_pmd(vma, pmd, address); + count_vm_event(THP_FAULT_FALLBACK); + } + return result; +} +EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); +#endif /* CONFIG_FS_DAX_PMD */ #endif /* CONFIG_FS_IOMAP */ diff --git a/fs/direct-io.c b/fs/direct-io.c index fb9aa16a7727..aeae8c063451 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -457,7 +457,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ -static int sb_init_dio_done_wq(struct super_block *sb) +int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", @@ -843,24 +843,6 @@ out: } /* - * Clean any dirty buffers in the blockdev mapping which alias newly-created - * file blocks. Only called for S_ISREG files - blockdevs do not set - * buffer_new - */ -static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) -{ - unsigned i; - unsigned nblocks; - - nblocks = map_bh->b_size >> dio->inode->i_blkbits; - - for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(map_bh->b_bdev, - map_bh->b_blocknr + i); - } -} - -/* * If we are not writing the entire block and get_block() allocated * the block for us, we need to fill-in the unused portion of the * block with zeros. This happens only if user-buffer, fileoffset or @@ -960,11 +942,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, goto do_holes; sdio->blocks_available = - map_bh->b_size >> sdio->blkbits; + map_bh->b_size >> blkbits; sdio->next_block_for_io = map_bh->b_blocknr << sdio->blkfactor; - if (buffer_new(map_bh)) - clean_blockdev_aliases(dio, map_bh); + if (buffer_new(map_bh)) { + clean_bdev_aliases( + map_bh->b_bdev, + map_bh->b_blocknr, + map_bh->b_size >> blkbits); + } if (!sdio->blkfactor) goto do_holes; @@ -1209,7 +1195,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->inode = inode; if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; - dio->op_flags = WRITE_ODIRECT; + dio->op_flags = REQ_SYNC | REQ_IDLE; } else { dio->op = REQ_OP_READ; } diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index dcea1e37a1b7..07fed838d8fd 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -268,7 +268,7 @@ void dlm_callback_work(struct work_struct *work) int dlm_callback_start(struct dlm_ls *ls) { ls->ls_callback_wq = alloc_workqueue("dlm_callback", - WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); if (!ls->ls_callback_wq) { log_print("can't start dlm_callback workqueue"); return -ENOMEM; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index df955d2209ce..7211e826d90d 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -12,7 +12,7 @@ ******************************************************************************/ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/configfs.h> #include <linux/slab.h> #include <linux/in.h> diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 466f7d60edc2..ca7089aeadab 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -12,7 +12,7 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/slab.h> diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 216b61604ef9..b670f5601fbb 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -18,7 +18,6 @@ * This is the main header file to be included in each DLM source file. */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/types.h> diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f3e72787e7f9..91592b75c309 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -11,6 +11,8 @@ ******************************************************************************* ******************************************************************************/ +#include <linux/module.h> + #include "dlm_internal.h" #include "lockspace.h" #include "member.h" diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 609998de533e..7d398d300e97 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -519,29 +519,25 @@ out: /* Note: sk_callback_lock must be locked before calling this function. */ static void save_callbacks(struct connection *con, struct sock *sk) { - lock_sock(sk); con->orig_data_ready = sk->sk_data_ready; con->orig_state_change = sk->sk_state_change; con->orig_write_space = sk->sk_write_space; con->orig_error_report = sk->sk_error_report; - release_sock(sk); } static void restore_callbacks(struct connection *con, struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); - lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = con->orig_data_ready; sk->sk_state_change = con->orig_state_change; sk->sk_write_space = con->orig_write_space; sk->sk_error_report = con->orig_error_report; - release_sock(sk); write_unlock_bh(&sk->sk_callback_lock); } /* Make a socket active */ -static void add_sock(struct socket *sock, struct connection *con) +static void add_sock(struct socket *sock, struct connection *con, bool save_cb) { struct sock *sk = sock->sk; @@ -549,7 +545,7 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock = sock; sk->sk_user_data = con; - if (!test_bit(CF_IS_OTHERCON, &con->flags)) + if (save_cb) save_callbacks(con, sk); /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; @@ -806,7 +802,7 @@ static int tcp_accept_from_sock(struct connection *con) newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon); + add_sock(newsock, othercon, false); addcon = othercon; } else { @@ -819,7 +815,10 @@ static int tcp_accept_from_sock(struct connection *con) else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon); + /* accept copies the sk after we've saved the callbacks, so we + don't want to save them a second time or comm errors will + result in calling sk_error_report recursively. */ + add_sock(newsock, newcon, false); addcon = newcon; } @@ -880,7 +879,8 @@ static int sctp_accept_from_sock(struct connection *con) } make_sockaddr(&prim.ssp_addr, 0, &addr_len); - if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + ret = addr_to_nodeid(&prim.ssp_addr, &nodeid); + if (ret) { unsigned char *b = (unsigned char *)&prim.ssp_addr; log_print("reject connect from unknown addr"); @@ -919,7 +919,7 @@ static int sctp_accept_from_sock(struct connection *con) newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon); + add_sock(newsock, othercon, false); addcon = othercon; } else { printk("Extra connection from node %d attempted\n", nodeid); @@ -930,7 +930,7 @@ static int sctp_accept_from_sock(struct connection *con) } else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon); + add_sock(newsock, newcon, false); addcon = newcon; } @@ -1058,7 +1058,7 @@ static void sctp_connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = sctp_connect_to_sock; - add_sock(sock, con); + add_sock(sock, con, true); /* Bind to all addresses. */ if (sctp_bind_addrs(con, 0)) @@ -1146,7 +1146,7 @@ static void tcp_connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; - add_sock(sock, con); + add_sock(sock, con, true); /* Bind to our cluster-known address connecting to avoid routing problems */ @@ -1366,7 +1366,7 @@ static int tcp_listen_for_all(void) sock = tcp_create_listen_sock(con, dlm_local_addr[0]); if (sock) { - add_sock(sock, con); + add_sock(sock, con, true); result = 0; } else { diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 079c0bd71ab7..8e1b618891be 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -11,6 +11,8 @@ ******************************************************************************* ******************************************************************************/ +#include <linux/module.h> + #include "dlm_internal.h" #include "lockspace.h" #include "lock.h" diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 1e6e227134d7..43a96c330570 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -16,11 +16,7 @@ static uint32_t dlm_nl_seqnum; static uint32_t listener_nlportid; -static struct genl_family family = { - .id = GENL_ID_GENERATE, - .name = DLM_GENL_NAME, - .version = DLM_GENL_VERSION, -}; +static struct genl_family family; static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) { @@ -69,16 +65,24 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct genl_ops dlm_nl_ops[] = { +static const struct genl_ops dlm_nl_ops[] = { { .cmd = DLM_CMD_HELLO, .doit = user_cmd, }, }; +static struct genl_family family __ro_after_init = { + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, + .ops = dlm_nl_ops, + .n_ops = ARRAY_SIZE(dlm_nl_ops), + .module = THIS_MODULE, +}; + int __init dlm_netlink_init(void) { - return genl_register_family_with_ops(&family, dlm_nl_ops); + return genl_register_family(&family); } void dlm_netlink_exit(void) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 58c2f4a21b7f..1ce908c2232c 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -9,7 +9,6 @@ #include <linux/miscdevice.h> #include <linux/init.h> #include <linux/wait.h> -#include <linux/module.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/poll.h> diff --git a/fs/exec.c b/fs/exec.c index 4e497b9ee71e..8112eacf10f3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL); + &page, NULL, NULL); if (ret <= 0) return NULL; @@ -1169,8 +1169,10 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; +#ifdef CONFIG_POSIX_TIMERS exit_itimers(sig); flush_itimer_signals(); +#endif if (atomic_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; @@ -1275,8 +1277,22 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file_inode(file), MAY_READ) < 0) + struct inode *inode = file_inode(file); + if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + /* Ensure mm->user_ns contains the executable */ + user_ns = old = bprm->mm->user_ns; + while ((user_ns != &init_user_ns) && + !privileged_wrt_inode_uidgid(user_ns, inode)) + user_ns = user_ns->parent; + + if (old != user_ns) { + bprm->mm->user_ns = get_user_ns(user_ns); + put_user_ns(old); + } + } } EXPORT_SYMBOL(would_dump); @@ -1306,7 +1322,6 @@ void setup_new_exec(struct linux_binprm * bprm) !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { - would_dump(bprm, bprm->file); if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) set_dumpable(current->mm, suid_dumpable); } @@ -1406,7 +1421,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) unsigned n_fs; if (p->ptrace) { - if (p->ptrace & PT_PTRACE_CAP) + if (ptracer_capable(p, current_user_ns())) bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; else bprm->unsafe |= LSM_UNSAFE_PTRACE; @@ -1741,6 +1756,8 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out; + would_dump(bprm, bprm->file); + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a0e1478dfd04..b0f241528a30 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) return 0; /* skip atime */ inode_lock_shared(inode); - ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); + ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out_unlock; - ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); + ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); mark_inode_dirty(inode); @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) -{ - struct inode *inode = file_inode(vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - int ret; - - if (flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - } - down_read(&ei->dax_sem); - - ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); - - up_read(&ei->dax_sem); - if (flags & FAULT_FLAG_WRITE) - sb_end_pagefault(inode->i_sb); - return ret; -} - static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, - .pmd_fault = ext2_dax_pmd_fault, + /* + * .pmd_fault is not supported for DAX because allocation in ext2 + * cannot be reliably aligned to huge page sizes and so pmd faults + * will always fail and fail back to regular faults. + */ .page_mkwrite = ext2_dax_fault, .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; @@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_MIXEDMAP; return 0; } #else diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 41b8b44a391c..e173afe92661 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -732,16 +732,13 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { - int i; - /* * We must unmap blocks before zeroing so that writeback cannot * overwrite zeros with stale data from block device page cache. */ - for (i = 0; i < count; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key), + count); /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's @@ -850,6 +847,9 @@ struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; +#else +/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ +struct iomap_ops ext2_iomap_ops; #endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -1293,9 +1293,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, ext2_get_block); - else if (test_opt(inode->i_sb, NOBH)) + if (IS_DAX(inode)) { + error = iomap_zero_range(inode, newsize, + PAGE_ALIGN(newsize) - newsize, NULL, + &ext2_iomap_ops); + } else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); else diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e38039fd96ff..7b90691e98c4 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -37,6 +37,7 @@ config EXT4_FS select CRC16 select CRYPTO select CRYPTO_CRC32C + select FS_IOMAP if FS_DAX help This is the next generation of the ext3 filesystem. diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dfa519979038..fd389935ecd1 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -196,7 +196,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a8a750f59621..2163c1e69f2a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -397,8 +397,9 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ @@ -1533,12 +1534,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } -static inline struct timespec ext4_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -2277,11 +2272,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -static inline int ext4_sb_has_crypto(struct super_block *sb) -{ - return ext4_has_feature_encrypt(sb); -} - static inline bool ext4_encrypted_inode(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); @@ -2339,8 +2329,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info @@ -2458,8 +2448,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, @@ -2492,7 +2480,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); -extern void ext4_truncate(struct inode *); +extern int ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); @@ -3129,7 +3117,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); @@ -3265,12 +3253,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } -static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) -{ - int blksize = 1 << inode->i_blkbits; - - return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); -} +extern struct iomap_ops ext4_iomap_ops; #endif /* __KERNEL__ */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b1d52c14098e..f97611171023 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC)) + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - else - BUG(); + BUG(); } static inline int ext4_should_journal_data(struct inode *inode) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c930a0110fb4..b1f8416923ab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3777,14 +3777,6 @@ out: return err; } -static void unmap_underlying_metadata_blocks(struct block_device *bdev, - sector_t block, int count) -{ - int i; - for (i = 0; i < count; i++) - unmap_underlying_metadata(bdev, block + i); -} - /* * Handle EOFBLOCKS_FL flag, clearing it if necessary */ @@ -4121,9 +4113,8 @@ out: * new. */ if (allocated > map->m_len) { - unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, - newblock + map->m_len, - allocated - map->m_len); + clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, + allocated - map->m_len); allocated = map->m_len; } map->m_len = allocated; @@ -4631,7 +4622,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(handle_t *handle, struct inode *inode) +int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; @@ -4645,7 +4636,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode) /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); + if (err) + return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -4657,12 +4650,9 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - if (err) { - ext4_std_error(inode->i_sb, err); - return; - } - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - ext4_std_error(inode->i_sb, err); + if (err) + return err; + return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, @@ -4701,7 +4691,7 @@ retry: /* * Recalculate credits when extent tree depth changes. */ - if (depth >= 0 && depth != ext_depth(inode)) { + if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } @@ -4725,7 +4715,7 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (new_size) { if (epos > new_size) epos = new_size; @@ -4853,7 +4843,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); @@ -4878,7 +4868,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_dio; } - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); if (new_size) { ext4_update_inode_size(inode, new_size); } else { @@ -5568,7 +5558,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -5678,7 +5668,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2a822d30e73f..b5f184493c57 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -31,6 +31,42 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + inode_lock_shared(inode); + /* + * Recheck under inode lock - at this point we are sure it cannot + * change anymore + */ + if (!IS_DAX(inode)) { + inode_unlock_shared(inode); + /* Fallback to buffered IO in case we cannot support DAX */ + return generic_file_read_iter(iocb, to); + } + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} +#endif + +static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + if (!iov_iter_count(to)) + return 0; /* skip atime */ + +#ifdef CONFIG_FS_DAX + if (IS_DAX(file_inode(iocb->ki_filp))) + return ext4_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release @@ -88,6 +124,86 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) return 0; } +/* Is IO overwriting allocated and initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) +{ + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, blklen; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = pos >> blkbits; + map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of the blocks have been preallocated, + * regardless of whether they have been initialized or not. To exclude + * unwritten extents, we need to check m_flags. + */ + return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + return ret; + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) + return -EFBIG; + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); + } + return iov_iter_count(from); +} + +#ifdef CONFIG_FS_DAX +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + bool overwrite = false; + + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + ret = file_update_time(iocb->ki_filp); + if (ret) + goto out; + + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + overwrite = true; + downgrade_write(&inode->i_rwsem); + } + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); +out: + if (!overwrite) + inode_unlock(inode); + else + inode_unlock_shared(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +#endif + static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -97,8 +213,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int overwrite = 0; ssize_t ret; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + inode_lock(inode); - ret = generic_write_checks(iocb, from); + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -114,53 +235,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { - ret = -EFBIG; - goto out; - } - iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); - } - iocb->private = &overwrite; - if (o_direct) { - size_t length = iov_iter_count(from); - loff_t pos = iocb->ki_pos; - - /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - pos + length <= i_size_read(inode)) { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; - int err, len; - - map.m_lblk = pos >> blkbits; - map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); - len = map.m_len; - - err = ext4_map_blocks(NULL, inode, &map, 0); - /* - * 'err==len' means that all of blocks has - * been preallocated no matter they are - * initialized or not. For excluding - * unwritten extents, we need to check - * m_flags. There are two conditions that - * indicate for initialized extents. 1) If we - * hit extent cache, EXT4_MAP_MAPPED flag is - * returned; 2) If we do a real lookup, - * non-flags are returned. So we should check - * these two conditions. - */ - if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) - overwrite = 1; - } - } + /* Check whether we do a DIO overwrite or not */ + if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && + ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) + overwrite = 1; ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -196,7 +275,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = dax_fault(vma, vmf, ext4_dax_get_block); + result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); if (write) { if (!IS_ERR(handle)) @@ -230,9 +309,10 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; - else - result = dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block); + else { + result = dax_iomap_pmd_fault(vma, addr, pmd, flags, + &ext4_iomap_ops); + } if (write) { if (!IS_ERR(handle)) @@ -687,7 +767,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read_iter = generic_file_read_iter, + .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 170421edfdfe..e57e8d90ea54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1039,7 +1039,7 @@ got: /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - ext4_current_time(inode); + current_time(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -1115,8 +1115,7 @@ got: } if (encrypt) { - /* give pointer to avoid set_context with journal ops. */ - err = fscrypt_inherit_context(dir, inode, &encrypt, true); + err = fscrypt_inherit_context(dir, inode, handle, true); if (err) goto fail_free_drop; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f74d5ee2cdec..437df6a1a841 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle, EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get cleared + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -336,8 +341,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); - if (!value) + if (!value) { + error = -ENOMEM; goto out; + } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); @@ -442,6 +449,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get set. + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1028,7 +1040,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -1971,7 +1983,7 @@ out: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c064727ed62..88d57af1b516 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> +#include <linux/iomap.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -71,10 +72,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, - EXT4_INODE_SIZE(inode->i_sb) - - offset); } + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; @@ -261,8 +261,15 @@ void ext4_evict_inode(struct inode *inode) "couldn't mark inode dirty (err %d)", err); goto stop_handle; } - if (inode->i_blocks) - ext4_truncate(inode); + if (inode->i_blocks) { + err = ext4_truncate(inode); + if (err) { + ext4_error(inode->i_sb, + "couldn't truncate inode %lu (err %d)", + inode->i_ino, err); + goto stop_handle; + } + } /* * ext4_ext_truncate() doesn't reserve any slop when it @@ -654,12 +661,8 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - ext4_lblk_t i; - - for (i = 0; i < map->m_len; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - map->m_pblk + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -767,6 +770,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; + } else if (ret == 0) { + /* hole case, need to fill in bh->b_size */ + bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } @@ -1127,8 +1133,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -1166,7 +1171,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = fscrypt_decrypt_page(page); + err = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); return err; } #endif @@ -2360,11 +2366,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) BUG_ON(map->m_len == 0); if (map->m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map->m_len; i++) - unmap_underlying_metadata(bdev, map->m_pblk + i); + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); } return 0; } @@ -2891,7 +2894,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb)) { + if (ext4_nonda_switch(inode->i_sb) || + S_ISLNK(inode->i_mode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3268,53 +3272,159 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -/* - * Get block function for DAX IO and mmap faults. It takes care of converting - * unwritten extents to written ones and initializes new / converted blocks - * to zeros. - */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) { + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long last_block = (offset + length - 1) >> blkbits; + struct ext4_map_blocks map; int ret; - ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); - if (!create) - return _ext4_get_block(inode, iblock, bh_result, 0); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) - return ret; + map.m_lblk = first_block; + map.m_len = last_block - first_block + 1; - if (buffer_unwritten(bh_result)) { + if (!(flags & IOMAP_WRITE)) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + } else { + int dio_credits; + handle_t *handle; + int retries = 0; + + /* Trim mapping request to maximum we can map at once for DIO */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); +retry: /* - * We are protected by i_mmap_sem or i_mutex so we know block - * cannot go away from under us even though we dropped - * i_data_sem. Convert extent to written and write zeros there. + * Either we allocate blocks and then we don't get unwritten + * extent so we have reserved enough credits, or the blocks + * are already allocated and unwritten and in that case + * extent conversion fits in the credits as well. */ - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_CONVERT | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) { + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; return ret; + } + + /* + * If we added blocks beyond i_size, we need to make sure they + * will get truncated if we crash before updating i_size in + * ext4_iomap_end(). For faults we don't need to do that (and + * even cannot because for orphan list operations inode_lock is + * required) - if we happen to instantiate block beyond i_size, + * it is because we race with truncate which has already added + * the inode to the orphan list. + */ + if (!(flags & IOMAP_FAULT) && first_block + map.m_len > + (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { + int err; + + err = ext4_orphan_add(handle, inode); + if (err < 0) { + ext4_journal_stop(handle); + return err; + } + } + ext4_journal_stop(handle); } - /* - * At least for now we have to clear BH_New so that DAX code - * doesn't attempt to zero blocks again in a racy way. - */ - clear_buffer_new(bh_result); + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = (u64)map.m_len << blkbits; + } else { + if (map.m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + } else { + WARN_ON_ONCE(1); + return -EIO; + } + iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); + iomap->length = (u64)map.m_len << blkbits; + } + + if (map.m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; return 0; } -#else -/* Just define empty function, it will never get called. */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) { - BUG(); - return 0; + int ret = 0; + handle_t *handle; + int blkbits = inode->i_blkbits; + bool truncate = false; + + if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto orphan_del; + } + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + if (iomap->offset + iomap->length > + ALIGN(inode->i_size, 1 << blkbits)) { + ext4_lblk_t written_blk, end_blk; + + written_blk = (offset + written) >> blkbits; + end_blk = (offset + length) >> blkbits; + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + } + /* + * Remove inode from orphan list if we were extending a inode and + * everything went fine. + */ + if (!truncate && inode->i_nlink && + !list_empty(&EXT4_I(inode)->i_orphan)) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + if (truncate) { + ext4_truncate_failed_write(inode); +orphan_del: + /* + * If truncate failed early the inode might still be on the + * orphan list; we need to make sure the inode is removed from + * the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret; } + +struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, +}; + #endif static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3436,19 +3546,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (IS_DAX(inode)) { - /* - * We can avoid zeroing for aligned DAX writes beyond EOF. Other - * writes need zeroing either because they can race with page - * faults or because they use partial blocks. - */ - if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && - ext4_aligned_io(inode, offset, count)) - get_block_func = ext4_dio_get_block; - else - get_block_func = ext4_dax_get_block; - dio_flags = DIO_LOCKING; - } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; @@ -3462,14 +3560,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) #ifdef CONFIG_EXT4_FS_ENCRYPTION BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, get_block_func, - ext4_end_io_dio, dio_flags); - } else - ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, - get_block_func, - ext4_end_io_dio, NULL, dio_flags); + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + get_block_func, ext4_end_io_dio, NULL, + dio_flags); if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { @@ -3538,6 +3631,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3546,19 +3640,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) * we are protected against page writeback as well. */ inode_lock_shared(inode); - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); - } else { - size_t count = iov_iter_count(iter); - - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, - NULL, NULL, 0); - } + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, ext4_dio_get_block, NULL, NULL, 0); out_unlock: inode_unlock_shared(inode); return ret; @@ -3587,6 +3674,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ext4_has_inline_data(inode)) return 0; + /* DAX uses iomap path now */ + if (WARN_ON_ONCE(IS_DAX(inode))) + return 0; + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == READ) ret = ext4_direct_IO_read(iocb, iter); @@ -3615,6 +3706,13 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } +static int ext4_set_page_dirty(struct page *page) +{ + WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); + WARN_ON_ONCE(!page_has_buffers(page)); + return __set_page_dirty_buffers(page); +} + static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, @@ -3622,6 +3720,7 @@ static const struct address_space_operations ext4_aops = { .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3654,6 +3753,7 @@ static const struct address_space_operations ext4_da_aops = { .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_da_invalidatepage, .releasepage = ext4_releasepage, @@ -3743,7 +3843,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - WARN_ON_ONCE(fscrypt_decrypt_page(page)); + WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, + page, PAGE_SIZE, 0, page->index)); } } if (ext4_should_journal_data(inode)) { @@ -3792,8 +3893,10 @@ static int ext4_block_zero_page_range(handle_t *handle, if (length > max || length < 0) length = max; - if (IS_DAX(inode)) - return dax_zero_page_range(inode, from, length, ext4_get_block); + if (IS_DAX(inode)) { + return iomap_zero_range(inode, from, length, NULL, + &ext4_iomap_ops); + } return __ext4_block_zero_page_range(handle, mapping, from, length); } @@ -4026,7 +4129,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); @@ -4091,10 +4194,11 @@ int ext4_inode_attach_jinode(struct inode *inode) * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ -void ext4_truncate(struct inode *inode) +int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; + int err = 0; handle_t *handle; struct address_space *mapping = inode->i_mapping; @@ -4108,7 +4212,7 @@ void ext4_truncate(struct inode *inode) trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) - return; + return 0; ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); @@ -4120,13 +4224,13 @@ void ext4_truncate(struct inode *inode) ext4_inline_data_truncate(inode, &has_inline); if (has_inline) - return; + return 0; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { if (ext4_inode_attach_jinode(inode) < 0) - return; + return 0; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4135,10 +4239,8 @@ void ext4_truncate(struct inode *inode) credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ext4_std_error(inode->i_sb, PTR_ERR(handle)); - return; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); @@ -4152,7 +4254,8 @@ void ext4_truncate(struct inode *inode) * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ - if (ext4_orphan_add(handle, inode)) + err = ext4_orphan_add(handle, inode); + if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); @@ -4160,11 +4263,13 @@ void ext4_truncate(struct inode *inode) ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(handle, inode); + err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); + if (err) + goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4180,11 +4285,12 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); + return err; } /* @@ -4352,7 +4458,9 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && + !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && + !ext4_encrypted_inode(inode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); @@ -4411,7 +4519,9 @@ static inline void ext4_iget_extra_inode(struct inode *inode, { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= + EXT4_INODE_SIZE(inode->i_sb) && + *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); ext4_find_inline_data_nolock(inode); } else @@ -4434,6 +4544,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; + loff_t size; int block; uid_t i_uid; gid_t i_gid; @@ -4456,10 +4567,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", - EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, - EXT4_INODE_SIZE(inode->i_sb)); + EXT4_INODE_SIZE(inode->i_sb) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, + "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } @@ -4534,6 +4647,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); + if ((size = i_size_read(inode)) < 0) { + EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -4577,6 +4695,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ + BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { @@ -5154,7 +5273,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * update c/mtime in shrink case below */ if (!shrink) { - inode->i_mtime = ext4_current_time(inode); + inode->i_mtime = current_time(inode); inode->i_ctime = inode->i_mtime; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5199,12 +5318,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); - if (shrink) - ext4_truncate(inode); + if (shrink) { + rc = ext4_truncate(inode); + if (rc) + error = rc; + } up_write(&EXT4_I(inode)->i_mmap_sem); } - if (!rc) { + if (!error) { setattr_copy(inode, attr); mark_inode_dirty(inode); } @@ -5216,7 +5338,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); - if (!rc && (ia_valid & ATTR_MODE)) + if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(inode, inode->i_mode); err_out: @@ -5455,18 +5577,20 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) return err; - if (ext4_handle_valid(handle) && - EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* - * We need extra buffer credits since we may write into EA block + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ - if ((jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + if (!ext4_handle_valid(handle) || + jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { ret = ext4_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); @@ -5620,6 +5744,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); + /* + * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. + * E.g. S_DAX may get cleared / set. + */ + ext4_set_inode_flags(inode); jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae8ebbc97..49fd1371bfa2 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -153,7 +153,7 @@ static long swap_inode_boot_loader(struct super_block *sb, swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + inode->i_ctime = inode_bl->i_ctime = current_time(inode); spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -191,6 +191,7 @@ journal_err_out: return err; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; @@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16]) return 0; return 1; } +#endif static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) @@ -248,8 +250,11 @@ static int ext4_ioctl_setflags(struct inode *inode, err = -EOPNOTSUPP; goto flags_out; } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); + } else if (oldflags & EXT4_EOFBLOCKS_FL) { + err = ext4_truncate(inode); + if (err) + goto flags_out; + } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { @@ -265,6 +270,9 @@ static int ext4_ioctl_setflags(struct inode *inode, for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; + /* These flags get special treatment later */ + if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) + continue; if (mask & flags) ext4_set_inode_flag(inode, i); else @@ -272,7 +280,7 @@ static int ext4_ioctl_setflags(struct inode *inode, } ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -368,7 +376,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) @@ -409,6 +417,10 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) return xflags; } +#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) { @@ -453,12 +465,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; + if (flags & ~EXT4_FL_USER_VISIBLE) + return -EOPNOTSUPP; + /* + * chattr(1) grabs flags via GETFLAGS, modifies the result and + * passes that to SETFLAGS. So we cannot easily make SETFLAGS + * more restrictive than just silently masking off visible but + * not settable flags as we always did. + */ + flags &= EXT4_FL_USER_MODIFIABLE; + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); err = ext4_ioctl_setflags(inode, flags); inode_unlock(inode); @@ -500,7 +522,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } @@ -765,28 +787,19 @@ resizefs_out: } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); - case EXT4_IOC_SET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; + case EXT4_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - if (copy_from_user(&policy, - (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - return fscrypt_process_policy(filp, &policy); -#else - return -EOPNOTSUPP; -#endif - } case EXT4_IOC_GET_ENCRYPTION_PWSALT: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; - if (!ext4_sb_has_crypto(sb)) + if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); @@ -816,24 +829,13 @@ resizefs_out: sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; - } - case EXT4_IOC_GET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; - int err = 0; - - if (!ext4_encrypted_inode(inode)) - return -ENOENT; - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; #else return -EOPNOTSUPP; #endif } + case EXT4_IOC_GET_ENCRYPTION_POLICY: + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -865,13 +867,17 @@ resizefs_out: if (!inode_owner_or_capable(inode)) return -EACCES; + if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_xflags_to_iflags(fa.fsx_xflags); - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f418f55c2bbe..7ae43c59bc79 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; - unsigned short border; + unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); @@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[16]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d89754ef1aab..eb9835638680 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -35,7 +35,7 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) } /* - * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { ret = -EIO; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 104f8bfba718..eadba919f26b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1941,7 +1941,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -2987,7 +2987,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); @@ -3050,13 +3050,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; - dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); end_unlink: @@ -3254,7 +3254,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_inc_count(handle, inode); ihold(inode); @@ -3381,7 +3381,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, ent->de->file_type = file_type; ent->dir->i_version++; ent->dir->i_ctime = ent->dir->i_mtime = - ext4_current_time(ent->dir); + current_time(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3651,7 +3651,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = ext4_current_time(old.inode); + old.inode->i_ctime = current_time(old.inode); ext4_mark_inode_dirty(handle, old.inode); if (!whiteout) { @@ -3663,9 +3663,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new.inode) { ext4_dec_count(handle, new.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + new.inode->i_ctime = current_time(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -3723,6 +3723,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; + struct timespec ctime; if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && @@ -3823,8 +3824,9 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = ext4_current_time(old.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + ctime = current_time(old.inode); + old.inode->i_ctime = ctime; + new.inode->i_ctime = ctime; ext4_mark_inode_dirty(handle, old.inode); ext4_mark_inode_dirty(handle, new.inode); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0094923e5ebf..d83b0f3c5fe9 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -340,7 +340,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0; + REQ_SYNC : 0; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -457,7 +457,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } set_buffer_async_write(bh); nr_to_submit++; @@ -470,7 +470,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = fscrypt_encrypt_page(inode, page, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, + page->index, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 52b0530c5d65..dfc8309d7755 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -863,7 +863,6 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_free_rwsem(&sbi->s_journal_flag_rwsem); - brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -895,6 +894,7 @@ static void ext4_put_super(struct super_block *sb) } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); + brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the @@ -1114,37 +1114,55 @@ static int ext4_prepare_context(struct inode *inode) static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { - handle_t *handle; - int res, res2; + handle_t *handle = fs_data; + int res, res2, retries = 0; + + /* + * If a journal handle was specified, then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ - /* fs_data is null when internally used. */ - if (fs_data) { - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - e.g. S_DAX may get disabled + */ + ext4_set_inode_flags(inode); } return res; } +retry: handle = ext4_journal_start(inode, EXT4_HT_MISC, ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) return PTR_ERR(handle); - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* Update inode->i_flags - e.g. S_DAX may get disabled */ + ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (!res) res = res2; return res; @@ -1883,12 +1901,6 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && - test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " - "in data=ordered mode"); - return 0; - } return 1; } @@ -2330,7 +2342,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; - int nr_orphans = 0, nr_truncates = 0; + int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -2412,7 +2424,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); + ret = ext4_truncate(inode); + if (ret) + ext4_std_error(inode->i_sb, ret); inode_unlock(inode); nr_truncates++; } else { @@ -3193,10 +3207,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_set_bit(s++, buf); count++; } - for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { - ext4_set_bit(EXT4_B2C(sbi, s++), buf); - count++; + j = ext4_bg_num_gdb(sb, grp); + if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_error(sb, "Invalid number of block group " + "descriptor blocks: %d", j); + j = EXT4_BLOCKS_PER_GROUP(sb) - s; } + count += j; + for (; j > 0; j--) + ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; @@ -3301,7 +3320,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); ext4_fsblk_t block; ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; @@ -3320,16 +3339,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out_free_orig; + if ((data && !orig_data) || !sbi) + goto out_free_base; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - goto out_free_orig; - } + if (!sbi->s_blockgroup_lock) + goto out_free_base; + sb->s_fs_info = sbi; sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; @@ -3475,11 +3492,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - sbi->s_es->s_mount_opts); + if (sbi->s_es->s_mount_opts[0]) { + char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, + sizeof(sbi->s_es->s_mount_opts), + GFP_KERNEL); + if (!s_mount_opts) + goto failed_mount; + if (!parse_options(s_mount_opts, sb, &journal_devnum, + &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); + } + kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, @@ -3505,6 +3530,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dax"); goto failed_mount; } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { @@ -3660,12 +3690,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext4; sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); @@ -3748,13 +3782,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_cluster_ratio = clustersize / blocksize; - if (sbi->s_inodes_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); @@ -3814,6 +3841,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) >= db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + goto failed_mount; + } + } sbi->s_group_desc = ext4_kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); @@ -3967,6 +4003,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + goto failed_mount_wq; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; @@ -4160,7 +4204,9 @@ no_journal: if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + "Opts: %.*s%s%s", descr, + (int) sizeof(sbi->s_es->s_mount_opts), + sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); if (es->s_error_count) @@ -4239,8 +4285,8 @@ failed_mount: out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); +out_free_base: kfree(sbi); -out_free_orig: kfree(orig_data); return err ? err : ret; } @@ -4550,7 +4596,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); - lock_buffer(sbh); + if (sync) + lock_buffer(sbh); if (buffer_write_io_error(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -4566,10 +4613,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) set_buffer_uptodate(sbh); } mark_buffer_dirty(sbh); - unlock_buffer(sbh); if (sync) { + unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, - test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); + test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC); if (error) return error; @@ -4857,6 +4904,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + err = -EINVAL; + goto restore_opts; + } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { @@ -5366,7 +5420,7 @@ static int ext4_quota_off(struct super_block *sb, int type) handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto out; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d77be9e9f535..5a94fa52b74f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -185,6 +185,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, { struct ext4_xattr_entry *e = entry; + /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) @@ -192,15 +193,29 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, e = next; } + /* Check the values */ while (!IS_LAST_ENTRY(entry)) { if (entry->e_value_block != 0) return -EFSCORRUPTED; - if (entry->e_value_size != 0 && - (value_start + le16_to_cpu(entry->e_value_offs) < - (void *)e + sizeof(__u32) || - value_start + le16_to_cpu(entry->e_value_offs) + - le32_to_cpu(entry->e_value_size) > end)) - return -EFSCORRUPTED; + if (entry->e_value_size != 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); + u32 size = le32_to_cpu(entry->e_value_size); + void *value; + + /* + * The value cannot overlap the names, and the value + * with padding cannot extend beyond 'end'. Check both + * the padded and unpadded sizes, since the size may + * overflow to 0 when adding padding. + */ + if (offs > end - value_start) + return -EFSCORRUPTED; + value = value_start + offs; + if (value < (void *)e + sizeof(u32) || + size > end - value || + EXT4_XATTR_SIZE(size) > end - value) + return -EFSCORRUPTED; + } entry = EXT4_XATTR_NEXT(entry); } @@ -231,13 +246,12 @@ static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { - struct ext4_xattr_entry *entry = IFIRST(header); int error = -EFSCORRUPTED; - if (((void *) header >= end) || + if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; - error = ext4_xattr_check_names(entry, end, entry); + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, @@ -1109,7 +1123,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, +static int ext4_xattr_ibody_set(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1216,7 +1230,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1227,7 +1241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1242,14 +1256,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; if (!is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(handle, inode, &i, - &is); + error = ext4_xattr_ibody_set(inode, &i, &is); } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (!value) ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1384,7 +1397,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); + error = ext4_xattr_ibody_set(inode, &i, is); if (error) goto out; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 6fe23af509e1..8f487692c21f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7e9b504bd8b2..f73ee9534d83 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = READ_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, @@ -160,7 +160,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; @@ -228,7 +228,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, @@ -770,7 +770,12 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; + + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; if (cp_blks <= 1) goto done; @@ -793,6 +798,9 @@ done: f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; @@ -921,7 +929,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; @@ -987,7 +999,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } @@ -998,7 +1010,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); @@ -1123,7 +1135,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1184,9 +1196,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is @@ -1261,8 +1273,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - - f2fs_wait_all_discard_bio(sbi); + if (err) { + release_discard_addrs(sbi); + } else { + clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ae194fd2fdb..9ac262564fa6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,26 @@ #include "trace.h" #include <trace/events/f2fs.h> +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; @@ -71,6 +91,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -78,9 +99,11 @@ static void f2fs_write_end_io(struct bio *bio) mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); @@ -88,6 +111,46 @@ static void f2fs_write_end_io(struct bio *bio) } /* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + +/* * Low-level block read/write IO operations. */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, @@ -97,8 +160,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; @@ -109,8 +171,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { - atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } @@ -198,11 +259,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - if (test_opt(sbi, NOBARRIER)) - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META | - REQ_PRIO; + io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -270,22 +329,24 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags))) + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -483,7 +544,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, 0, false); if (IS_ERR(page)) return page; @@ -509,7 +570,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, 0, for_write); if (IS_ERR(page)) return page; @@ -590,7 +651,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -608,11 +668,8 @@ alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ @@ -624,11 +681,18 @@ alloc: return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -640,19 +704,22 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -676,7 +743,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -716,7 +782,7 @@ next_dnode: } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: @@ -735,10 +801,8 @@ next_block: } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -793,7 +857,6 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -812,9 +875,8 @@ skip: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -822,7 +884,7 @@ sync_out: unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); @@ -834,19 +896,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, @@ -891,7 +953,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -908,13 +969,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -933,13 +987,11 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. - */ + flags |= FIEMAP_EXTENT_LAST; } @@ -982,7 +1034,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -1000,8 +1051,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1096,7 +1146,8 @@ got_it: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; @@ -1195,7 +1246,9 @@ int do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); + PAGE_SIZE, 0, + fio->page->index, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); if (err == -ENOMEM) { @@ -1251,7 +1304,7 @@ static int f2fs_write_data_page(struct page *page, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1311,7 +1364,6 @@ done: if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1332,6 +1384,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1427,6 +1481,15 @@ continue_unlock: ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; @@ -1663,7 +1726,7 @@ repeat: err = PTR_ERR(bio); goto fail; } - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; @@ -1714,7 +1777,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); @@ -1751,9 +1813,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (__force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); @@ -1785,12 +1845,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fb245bd302e4..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -74,7 +75,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +196,9 @@ get_cache: si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -310,22 +314,22 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -373,6 +377,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct file_operations stat_fops = { + .owner = THIS_MODULE, .open = stat_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 369f4513be37..827c5daef4fc 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } @@ -784,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -819,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -827,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -871,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -891,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err; + return err < 0 ? err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8de18a168a..2da8c3aa0ce5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -401,6 +401,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -413,6 +414,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -428,7 +431,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -493,20 +496,26 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ @@ -522,9 +531,9 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ /* for checkpoint */ @@ -585,7 +594,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { @@ -649,6 +657,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -656,6 +665,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -688,7 +699,7 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int op; /* contains REQ_OP_ */ - int op_flags; /* rq_flag_bits */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ @@ -704,6 +715,20 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ @@ -750,6 +775,12 @@ struct f2fs_sb_info { u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; u8 key_prefix_size; #endif + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ +#endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -764,6 +795,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ @@ -815,10 +847,9 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -863,6 +894,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -1105,13 +1138,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -1232,9 +1258,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1242,14 +1269,14 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1258,19 +1285,19 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) @@ -1329,22 +1356,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); - - start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; return start_addr; } +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); @@ -1621,7 +1653,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1648,7 +1680,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1657,7 +1689,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ -1668,7 +1700,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1682,34 +1714,27 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) -{ - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1837,13 +1862,31 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1955,7 +1998,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); @@ -1995,7 +2038,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode *, bool); void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); @@ -2034,7 +2077,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -2060,7 +2103,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); @@ -2132,12 +2175,15 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); @@ -2160,7 +2206,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* @@ -2181,12 +2227,12 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -2412,9 +2458,30 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); +} + +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; +} +#endif + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) @@ -2453,8 +2520,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c7865073cd26..49f10dce817d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -94,8 +94,6 @@ mapped: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -210,7 +208,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -264,7 +262,7 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -632,7 +630,7 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -679,6 +677,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = setattr_prepare(dentry, attr); if (err) @@ -694,7 +693,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is @@ -710,6 +708,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -722,7 +722,12 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - f2fs_mark_inode_dirty_sync(inode); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } @@ -967,7 +972,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, new_size = (dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); - } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); f2fs_put_dnode(&dn); } else { @@ -1218,6 +1223,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; @@ -1313,15 +1321,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1333,12 +1341,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1352,7 +1360,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -1393,7 +1401,9 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1526,7 +1536,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) @@ -1752,31 +1762,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1842,7 +1837,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; @@ -2256,12 +2251,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6f14ee923acd..88bfc3dff496 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -544,13 +544,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_READ, - .op_flags = READ_SYNC, + .op_flags = 0, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -565,6 +566,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -625,7 +629,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_wait_on_page_writeback(dn.node_page, NODE, true); fio.op = REQ_OP_WRITE; - fio.op_flags = WRITE_SYNC; + fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -645,7 +649,8 @@ out: f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -653,6 +658,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -663,7 +671,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC, + .op_flags = REQ_SYNC, .page = page, .encrypted_page = NULL, }; @@ -673,8 +681,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); @@ -683,8 +693,6 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); @@ -794,9 +802,9 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -899,7 +907,7 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; @@ -940,6 +948,9 @@ gc_more: if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5f1a67f756af..e32a9e527968 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -111,7 +111,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .sbi = F2FS_I_SB(dn->inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, }; @@ -137,8 +137,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); @@ -419,7 +421,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; @@ -573,7 +575,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); @@ -610,6 +612,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -622,11 +625,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..af06bda51a54 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -19,10 +19,11 @@ #include <trace/events/f2fs.h> -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if (f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -252,6 +253,7 @@ retry: int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -267,11 +269,13 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); @@ -335,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode)) + if (update_inode_page(inode) && wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -373,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #endif + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); @@ -384,6 +391,8 @@ retry: f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -403,10 +412,12 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); @@ -424,6 +435,18 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 489fa0d5f914..db33b5631dc8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -778,7 +778,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -938,7 +938,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -953,7 +953,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 01177ecdeab8..b9078fdb3743 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } @@ -1134,7 +1135,7 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, 0); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1204,6 +1205,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: @@ -1338,7 +1340,8 @@ retry: if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1407,11 +1410,12 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; } - +out: if (nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; @@ -1570,7 +1574,7 @@ static int f2fs_write_node_page(struct page *page, .sbi = sbi, .type = NODE, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1692,11 +1696,35 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { + struct f2fs_nm_info *nm_i = NM_I(sbi); + + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool reuse) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; list_del(&i->list); - radix_tree_delete(&nm_i->free_nid_root, i->nid); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1704,9 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; + int err; /* 0 nid should not be used */ if (unlikely(nid == 0)) @@ -1729,33 +1755,30 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); - if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); - radix_tree_preload_end(); + spin_lock(&nm_i->nid_list_lock); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); - radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1778,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void build_free_nids(struct f2fs_sb_info *sbi) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1794,7 +1815,10 @@ void build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) + return; + + if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; /* readahead nat pages to be scanned */ @@ -1830,7 +1854,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1839,6 +1863,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi, sync); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1853,31 +1884,31 @@ retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; + spin_lock(&nm_i->nid_list_lock); - spin_lock(&nm_i->free_nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); - build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); + build_free_nids(sbi, true); goto retry; } @@ -1889,11 +1920,11 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); - __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1910,17 +1941,22 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } - spin_unlock(&nm_i->free_nid_list_lock); + + nm_i->available_nids++; + + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1932,24 +1968,24 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; - __del_from_free_nid_list(nm_i, i); + + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2005,7 +2041,7 @@ recover_xnid: if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2035,7 +2071,7 @@ retry: } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2069,7 +2105,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2078,7 +2113,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); @@ -2120,6 +2155,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2192,8 +2240,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2268,21 +2320,24 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2310,7 +2365,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } @@ -2327,17 +2382,18 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); - __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..e7997e240366 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* @@ -313,7 +314,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == cpver_of_node(page); } /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..981a9584b62f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,13 +180,15 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = "<encrypted>"; else @@ -196,32 +198,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +224,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -454,7 +427,8 @@ retry_dn: continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (!file_keep_isize(inode) && + (i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -507,8 +481,10 @@ err: f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? "keep" : "recover", + recovered, err); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fc886f008449..0738f48293cc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -259,7 +259,7 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -274,8 +274,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); @@ -287,7 +289,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); @@ -363,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } @@ -380,14 +381,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); + + if (!is_idle(sbi)) + return; /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; @@ -400,6 +404,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_bdev = bdev; + ret = submit_bio_wait(bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -410,25 +441,18 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); - + ret = submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -449,15 +473,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -469,8 +489,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -481,6 +506,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -488,6 +518,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { @@ -500,14 +531,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -633,15 +670,23 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio) } /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio = NULL; int err; - err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); if (!err && bio) { struct bio_entry *be = __add_bio_entry(sbi, bio); @@ -654,24 +699,101 @@ int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, return err; } +#ifdef CONFIG_BLK_DEV_ZONED +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + sector_t sector; + int devi = 0; + + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, bdev, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); +} + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); - return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0); + + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, @@ -1296,25 +1418,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1448,21 +1566,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - type = direct_io ? CURSEG_WARM_DATA : type; - - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* @@ -1515,7 +1623,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -2166,7 +2274,6 @@ out: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2233,7 +2340,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2315,10 +2422,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; @@ -2387,6 +2494,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ @@ -2645,7 +2755,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) @@ -102,8 +104,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -695,13 +696,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. @@ -719,7 +713,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -736,11 +730,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6132b4ce4e4c..702638e21c76 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,14 +412,20 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -512,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { @@ -558,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -620,24 +629,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +659,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +688,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) @@ -687,20 +699,28 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -738,7 +758,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); @@ -762,6 +781,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -789,13 +810,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -822,7 +847,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; @@ -974,7 +1000,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { @@ -1076,8 +1102,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; @@ -1238,7 +1265,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, @@ -1426,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1437,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1447,6 +1485,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1471,6 +1510,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1486,13 +1528,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) @@ -1502,6 +1538,71 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) +{ + struct block_device *bdev = FDEV(devi).bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_size(bdev) - 1)) + FDEV(devi).nr_blkz++; + + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + FDEV(devi).blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1594,6 +1695,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1641,6 +1813,18 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#endif default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1710,6 +1894,13 @@ try_onemore: goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1893,12 +2084,21 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); @@ -2044,3 +2244,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3e1c0280f866..c47ce2f330a1 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -554,7 +554,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05713a5da083..ef600591d96f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb, work); - inode = wb_inode(wb->b_more_io.prev); - spin_lock(&inode->i_lock); - spin_unlock(&wb->list_lock); - /* This function drops i_lock... */ - inode_sleep_on_writeback(inode); - spin_lock(&wb->list_lock); - } + trace_writeback_wait(wb, work); + inode = wb_inode(wb->b_more_io.prev); + spin_lock(&inode->i_lock); + spin_unlock(&wb->list_lock); + /* This function drops i_lock... */ + inode_sleep_on_writeback(inode); + spin_lock(&wb->list_lock); } spin_unlock(&wb->list_lock); blk_finish_plug(&plug); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b3ebe512d64c..096f79997f75 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1739,8 +1739,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) * This should be done on write(), truncate() and chown(). */ if (!fc->handle_killpriv) { - int kill; - /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. @@ -1750,12 +1748,11 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) return ret; attr->ia_mode = inode->i_mode; - kill = should_remove_suid(entry); - if (kill & ATTR_KILL_SUID) { + if (inode->i_mode & S_ISUID) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISUID; } - if (kill & ATTR_KILL_SGID) { + if ((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISGID; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 3cdde5f5d399..79113219be5f 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -62,6 +62,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/vmalloc.h> +#include <linux/bio.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e58ccef09c91..27c00a16def0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int op_flags = WRITE_FLUSH_FUA | REQ_META; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -682,7 +682,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); - op_flags = WRITE_SYNC | REQ_META | REQ_PRIO; + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 49d5a1b61b06..b1f9144b42c7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio) * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 373639a59782..49db8ef13fdf 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_flags = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -285,7 +284,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -453,7 +452,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ff72ac6439c8..a34308df927f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -246,7 +246,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_META); submit_bio(bio); wait_on_page_locked(page); bio_put(bio); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 11854dd84572..67aedf4c2e7c 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error = error2; if (!write_backup) @@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error2 = error; out: diff --git a/fs/internal.h b/fs/internal.h index f4da3341b4a3..4fcf51766d4a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor); + +/* direct-io.c: */ +int sb_init_dio_done_wq(struct super_block *sb); diff --git a/fs/iomap.c b/fs/iomap.c index a8ee8c33ca78..354a123f170e 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -24,6 +24,7 @@ #include <linux/uio.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> +#include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include "internal.h" @@ -467,8 +468,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, offset = page_offset(page); while (length > 0) { - ret = iomap_apply(inode, offset, length, IOMAP_WRITE, - ops, page, iomap_page_mkwrite_actor); + ret = iomap_apply(inode, offset, length, + IOMAP_WRITE | IOMAP_FAULT, ops, page, + iomap_page_mkwrite_actor); if (unlikely(ret <= 0)) goto out_unlock; offset += ret; @@ -583,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); + +/* + * Private flags for iomap_dio, must not overlap with the public ones in + * iomap.h: + */ +#define IOMAP_DIO_WRITE (1 << 30) +#define IOMAP_DIO_DIRTY (1 << 31) + +struct iomap_dio { + struct kiocb *iocb; + iomap_dio_end_io_t *end_io; + loff_t i_size; + loff_t size; + atomic_t ref; + unsigned flags; + int error; + + union { + /* used during submission and for synchronous completion: */ + struct { + struct iov_iter *iter; + struct task_struct *waiter; + struct request_queue *last_queue; + blk_qc_t cookie; + } submit; + + /* used for aio completion: */ + struct { + struct work_struct work; + } aio; + }; +}; + +static ssize_t iomap_dio_complete(struct iomap_dio *dio) +{ + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (dio->end_io) { + ret = dio->end_io(iocb, + dio->error ? dio->error : dio->size, + dio->flags); + } else { + ret = dio->error; + } + + if (likely(!ret)) { + ret = dio->size; + /* check for short read */ + if (iocb->ki_pos + ret > dio->i_size && + !(dio->flags & IOMAP_DIO_WRITE)) + ret = dio->i_size - iocb->ki_pos; + iocb->ki_pos += ret; + } + + inode_dio_end(file_inode(iocb->ki_filp)); + kfree(dio); + + return ret; +} + +static void iomap_dio_complete_work(struct work_struct *work) +{ + struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); + struct kiocb *iocb = dio->iocb; + bool is_write = (dio->flags & IOMAP_DIO_WRITE); + ssize_t ret; + + ret = iomap_dio_complete(dio); + if (is_write && ret > 0) + ret = generic_write_sync(iocb, ret); + iocb->ki_complete(iocb, ret, 0); +} + +/* + * Set an error in the dio if none is set yet. We have to use cmpxchg + * as the submission context and the completion context(s) can race to + * update the error. + */ +static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) +{ + cmpxchg(&dio->error, 0, ret); +} + +static void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_error) + iomap_dio_set_error(dio, bio->bi_error); + + if (atomic_dec_and_test(&dio->ref)) { + if (is_sync_kiocb(dio->iocb)) { + struct task_struct *waiter = dio->submit.waiter; + + WRITE_ONCE(dio->submit.waiter, NULL); + wake_up_process(waiter); + } else if (dio->flags & IOMAP_DIO_WRITE) { + struct inode *inode = file_inode(dio->iocb->ki_filp); + + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + } else { + iomap_dio_complete_work(&dio->aio.work); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); + bio_put(bio); + } +} + +static blk_qc_t +iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, + unsigned len) +{ + struct page *page = ZERO_PAGE(0); + struct bio *bio; + + bio = bio_alloc(GFP_KERNEL, 1); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + get_page(page); + if (bio_add_page(bio, page, len, 0) != len) + BUG(); + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + + atomic_inc(&dio->ref); + return submit_bio(bio); +} + +static loff_t +iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct iomap_dio *dio = data; + unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); + unsigned fs_block_size = (1 << inode->i_blkbits), pad; + unsigned align = iov_iter_alignment(dio->submit.iter); + struct iov_iter iter; + struct bio *bio; + bool need_zeroout = false; + int nr_pages, ret; + + if ((pos | length | align) & ((1 << blkbits) - 1)) + return -EINVAL; + + switch (iomap->type) { + case IOMAP_HOLE: + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) + return -EIO; + /*FALLTHRU*/ + case IOMAP_UNWRITTEN: + if (!(dio->flags & IOMAP_DIO_WRITE)) { + iov_iter_zero(length, dio->submit.iter); + dio->size += length; + return length; + } + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + break; + case IOMAP_MAPPED: + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + if (iomap->flags & IOMAP_F_NEW) + need_zeroout = true; + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + /* + * Operate on a partial iter trimmed to the extent we were called for. + * We'll update the iter in the dio once we're done with this extent. + */ + iter = *dio->submit.iter; + iov_iter_truncate(&iter, length); + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + if (nr_pages <= 0) + return nr_pages; + + if (need_zeroout) { + /* zero out from the start of the block to the write offset */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos - pad, pad); + } + + do { + if (dio->error) + return 0; + + bio = bio_alloc(GFP_KERNEL, nr_pages); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, &iter); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + + if (dio->flags & IOMAP_DIO_WRITE) { + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + task_io_account_write(bio->bi_iter.bi_size); + } else { + bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); + } + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + + atomic_inc(&dio->ref); + + dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.cookie = submit_bio(bio); + } while (nr_pages); + + if (need_zeroout) { + /* zero out from the end of the write to the end of the block */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + } + + iov_iter_advance(dio->submit.iter, length); + return length; +} + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops, + iomap_dio_end_io_t end_io) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = file_inode(iocb->ki_filp); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0; + unsigned int flags = IOMAP_DIRECT; + struct blk_plug plug; + struct iomap_dio *dio; + + lockdep_assert_held(&inode->i_rwsem); + + if (!count) + return 0; + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + if (!dio) + return -ENOMEM; + + dio->iocb = iocb; + atomic_set(&dio->ref, 1); + dio->size = 0; + dio->i_size = i_size_read(inode); + dio->end_io = end_io; + dio->error = 0; + dio->flags = 0; + + dio->submit.iter = iter; + if (is_sync_kiocb(iocb)) { + dio->submit.waiter = current; + dio->submit.cookie = BLK_QC_T_NONE; + dio->submit.last_queue = NULL; + } + + if (iov_iter_rw(iter) == READ) { + if (pos >= dio->i_size) + goto out_free_dio; + + if (iter->type == ITER_IOVEC) + dio->flags |= IOMAP_DIO_DIRTY; + } else { + dio->flags |= IOMAP_DIO_WRITE; + flags |= IOMAP_WRITE; + } + + if (mapping->nrpages) { + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + if (ret) + goto out_free_dio; + + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; + } + + inode_dio_begin(inode); + + blk_start_plug(&plug); + do { + ret = iomap_apply(inode, pos, count, flags, ops, dio, + iomap_dio_actor); + if (ret <= 0) { + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) + ret = 0; + break; + } + pos += ret; + } while ((count = iov_iter_count(iter)) > 0); + blk_finish_plug(&plug); + + if (ret < 0) + iomap_dio_set_error(dio, ret); + + if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + iomap_dio_set_error(dio, ret); + } + + if (!atomic_dec_and_test(&dio->ref)) { + if (!is_sync_kiocb(iocb)) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !dio->submit.last_queue || + !blk_mq_poll(dio->submit.last_queue, + dio->submit.cookie)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + } + + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + return iomap_dio_complete(dio); + +out_free_dio: + kfree(dio); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 44af14b2e916..9bb2fe35799d 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/init.h> +#include <linux/bio.h> #include <linux/vmalloc.h> #include <linux/zlib.h> diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 98b3eb7d8eaf..0ec137310320 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -377,9 +377,9 @@ repeat: { int p; for (p = 0; p < rr->u.ER.len_id; p++) - printk("%c", rr->u.ER.data[p]); + printk(KERN_CONT "%c", rr->u.ER.data[p]); } - printk("\n"); + printk(KERN_CONT "\n"); break; case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 684996c8a3a4..4055f51617ef 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -186,7 +186,7 @@ __flush_batch(journal_t *journal, int *batch_count) blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC); blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 31f8ca046639..8c514367ba5a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -155,9 +155,10 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh); + ret = submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh); else - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); *cbh = bh; return ret; @@ -402,7 +403,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_SYNC); + REQ_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); @@ -717,7 +718,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } cond_resched(); stats.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 927da4956a89..8ed971eeab44 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -913,7 +913,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); if (ret) goto out; @@ -1306,7 +1306,7 @@ static int journal_reset(journal_t *journal) /* Lock here to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); /* - * Update log tail information. We use WRITE_FUA since new + * Update log tail information. We use REQ_FUA since new * transaction will start reusing journal space and so we * must make sure information about current log tail is on * disk before that. @@ -1314,7 +1314,7 @@ static int journal_reset(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_FUA); + REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1454,7 +1454,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1720,7 +1720,8 @@ int jbd2_journal_destroy(journal_t *journal) ++journal->j_transaction_sequence; write_unlock(&journal->j_state_lock); - jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); + jbd2_mark_journal_empty(journal, + REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1979,7 +1980,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2025,7 +2026,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 91171dc352cb..cfc38b552118 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -648,7 +648,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, WRITE_SYNC); + write_dirty_buffer(descriptor, REQ_SYNC); } #endif diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8653cac7e12e..b6fd1ff29ddf 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -121,7 +121,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_set_inode_flags(inode); inode_unlock(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); setflags_out: mnt_drop_write_file(filp); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21ea8b3e5fa..bb1da1feafeb 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2002,7 +2002,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; @@ -2146,7 +2146,7 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index a1982118f92f..ac9e108ce1ea 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -335,7 +335,7 @@ static int kernfs_xattr_set(const struct xattr_handler *handler, return simple_xattr_set(&attrs->xattrs, name, value, size, flags); } -const struct xattr_handler kernfs_trusted_xattr_handler = { +static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_xattr_get, .set = kernfs_xattr_set, @@ -372,7 +372,7 @@ static int kernfs_security_xattr_set(const struct xattr_handler *handler, return error; } -const struct xattr_handler kernfs_security_xattr_handler = { +static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = kernfs_xattr_get, .set = kernfs_security_xattr_set, diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5426189406c1..fb8cac88251a 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -15,6 +15,6 @@ struct lockd_net { struct list_head nsm_handles; }; -extern int lockd_net_id; +extern unsigned int lockd_net_id; #endif diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index fc4084ef4736..1c13dd80744f 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,7 +57,7 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -int lockd_net_id; +unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), diff --git a/fs/mbcache.c b/fs/mbcache.c index c5bd19ffa326..b19be429d655 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -29,7 +29,7 @@ struct mb_cache { /* log2 of hash table size */ int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ - int c_max_entries; + unsigned long c_max_entries; /* Protects c_list, c_entry_count */ spinlock_t c_list_lock; struct list_head c_list; @@ -43,7 +43,7 @@ struct mb_cache { static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, - unsigned int nr_to_scan); + unsigned long nr_to_scan); static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, u32 key) @@ -155,12 +155,12 @@ out: } /* - * mb_cache_entry_find_first - find the first entry in cache with given key + * mb_cache_entry_find_first - find the first reusable entry with the given key * @cache: cache where we should search * @key: key to look for * - * Search in @cache for entry with key @key. Grabs reference to the first - * entry found and returns the entry. + * Search in @cache for a reusable entry with key @key. Grabs reference to the + * first reusable entry found and returns the entry. */ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key) @@ -170,14 +170,14 @@ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, EXPORT_SYMBOL(mb_cache_entry_find_first); /* - * mb_cache_entry_find_next - find next entry in cache with the same + * mb_cache_entry_find_next - find next reusable entry with the same key * @cache: cache where we should search * @entry: entry to start search from * - * Finds next entry in the hash chain which has the same key as @entry. - * If @entry is unhashed (which can happen when deletion of entry races - * with the search), finds the first entry in the hash chain. The function - * drops reference to @entry and returns with a reference to the found entry. + * Finds next reusable entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races with the + * search), finds the first reusable entry in the hash chain. The function drops + * reference to @entry and returns with a reference to the found entry. */ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry) @@ -274,11 +274,11 @@ static unsigned long mb_cache_count(struct shrinker *shrink, /* Shrink number of entries in cache */ static unsigned long mb_cache_shrink(struct mb_cache *cache, - unsigned int nr_to_scan) + unsigned long nr_to_scan) { struct mb_cache_entry *entry; struct hlist_bl_head *head; - unsigned int shrunk = 0; + unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { @@ -286,7 +286,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, struct mb_cache_entry, e_list); if (entry->e_referenced) { entry->e_referenced = 0; - list_move_tail(&cache->c_list, &entry->e_list); + list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); @@ -316,10 +316,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - int nr_to_scan = sc->nr_to_scan; struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); - return mb_cache_shrink(cache, nr_to_scan); + return mb_cache_shrink(cache, sc->nr_to_scan); } /* We shrink 1/X of the cache when we have too many entries in it */ @@ -341,11 +340,8 @@ static void mb_cache_shrink_worker(struct work_struct *work) struct mb_cache *mb_cache_create(int bucket_bits) { struct mb_cache *cache; - int bucket_count = 1 << bucket_bits; - int i; - - if (!try_module_get(THIS_MODULE)) - return NULL; + unsigned long bucket_count = 1UL << bucket_bits; + unsigned long i; cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) @@ -377,7 +373,6 @@ struct mb_cache *mb_cache_create(int bucket_bits) return cache; err_out: - module_put(THIS_MODULE); return NULL; } EXPORT_SYMBOL(mb_cache_create); @@ -411,7 +406,6 @@ void mb_cache_destroy(struct mb_cache *cache) } kfree(cache->c_hash); kfree(cache); - module_put(THIS_MODULE); } EXPORT_SYMBOL(mb_cache_destroy); @@ -420,7 +414,8 @@ static int __init mbcache_init(void) mb_entry_cache = kmem_cache_create("mbcache", sizeof(struct mb_cache_entry), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - BUG_ON(!mb_entry_cache); + if (!mb_entry_cache) + return -ENOMEM; return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index d2413af0823a..28af984a3d96 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int op_flags = wbc_to_write_flags(wbc); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -555,8 +555,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (buffer_new(&map_bh)) - unmap_underlying_metadata(map_bh.b_bdev, - map_bh.b_blocknr); + clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; @@ -705,7 +704,7 @@ mpage_writepages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } } @@ -726,7 +725,7 @@ int mpage_writepage(struct page *page, get_block_t get_block, int ret = __mpage_writepage(page, wbc, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } return ret; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e9aa235e9d10..f073a6d2c6a5 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -110,20 +110,52 @@ out: #if defined(CONFIG_NFS_V4_1) /* - * Lookup a layout by filehandle. + * Lookup a layout inode by stateid * - * Note: gets a refcount on the layout hdr and on its respective inode. - * Caller must put the layout hdr and the inode. + * Note: returns a refcount on the inode and superblock + */ +static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, + const nfs4_stateid *stateid) +{ + struct nfs_server *server; + struct inode *inode; + struct pnfs_layout_hdr *lo; + +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (stateid != NULL && + !nfs4_stateid_match_other(stateid, &lo->plh_stateid)) + continue; + inode = igrab(lo->plh_inode); + if (!inode) + continue; + if (!nfs_sb_active(inode->i_sb)) { + rcu_read_lock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + goto restart; + } + return inode; + } + } + + return NULL; +} + +/* + * Lookup a layout inode by filehandle. + * + * Note: returns a refcount on the inode and superblock * - * TODO: keep track of all layouts (and delegations) in a hash table - * hashed by filehandle. */ -static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, - struct nfs_fh *fh) +static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, + const struct nfs_fh *fh) { struct nfs_server *server; struct nfs_inode *nfsi; - struct inode *ino; + struct inode *inode; struct pnfs_layout_hdr *lo; restart: @@ -134,37 +166,38 @@ restart: continue; if (nfsi->layout != lo) continue; - ino = igrab(lo->plh_inode); - if (!ino) - break; - spin_lock(&ino->i_lock); - /* Is this layout in the process of being freed? */ - if (nfsi->layout != lo) { - spin_unlock(&ino->i_lock); - iput(ino); + inode = igrab(lo->plh_inode); + if (!inode) + continue; + if (!nfs_sb_active(inode->i_sb)) { + rcu_read_lock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); goto restart; } - pnfs_get_layout_hdr(lo); - spin_unlock(&ino->i_lock); - return lo; + return inode; } } return NULL; } -static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, - struct nfs_fh *fh) +static struct inode *nfs_layout_find_inode(struct nfs_client *clp, + const struct nfs_fh *fh, + const nfs4_stateid *stateid) { - struct pnfs_layout_hdr *lo; + struct inode *inode; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh); + inode = nfs_layout_find_inode_by_stateid(clp, stateid); + if (!inode) + inode = nfs_layout_find_inode_by_fh(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); - return lo; + return inode; } /* @@ -213,18 +246,20 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh); - if (!lo) { - trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, - &args->cbl_stateid, -rv); + ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid); + if (!ino) goto out; - } - ino = lo->plh_inode; pnfs_layoutcommit_inode(ino, false); spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo) { + spin_unlock(&ino->i_lock); + goto out; + } + pnfs_get_layout_hdr(lo); rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); if (rv != NFS_OK) goto unlock; @@ -258,10 +293,10 @@ unlock: /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); +out: trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, &args->cbl_stateid, -rv); - iput(ino); -out: + nfs_iput_and_deactive(ino); return rv; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ebecfb8fba06..91a8d610ba0f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -369,9 +369,7 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ -struct nfs_client * -nfs_get_client(const struct nfs_client_initdata *cl_init, - rpc_authflavor_t authflavour) +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp, *new = NULL; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); @@ -655,7 +653,7 @@ static int nfs_init_server(struct nfs_server *server, set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); + clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index dff600ae0d74..d7df5e67b0c1 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -391,10 +391,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - /* Ensure we revalidate the attributes and page cache! */ - spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_REVAL_FORCED; - spin_unlock(&inode->i_lock); trace_nfs4_set_delegation(inode, res->delegation_type); out: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5f1af4cd1a33..cb22a9f9ae7e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -455,14 +455,17 @@ bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) } /* - * This function is called by the lookup code to request the use of - * readdirplus to accelerate any future lookups in the same + * This function is called by the lookup and getattr code to request the + * use of readdirplus to accelerate any future lookups in the same * directory. */ -static void nfs_advise_use_readdirplus(struct inode *dir) { - set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && + !list_empty(&nfsi->open_files)) + set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); } /* @@ -475,9 +478,12 @@ void nfs_advise_use_readdirplus(struct inode *dir) */ void nfs_force_use_readdirplus(struct inode *dir) { - if (!list_empty(&NFS_I(dir)->open_files)) { - nfs_advise_use_readdirplus(dir); - nfs_zap_mapping(dir, dir->i_mapping); + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && + !list_empty(&nfsi->open_files)) { + set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); + invalidate_mapping_pages(dir->i_mapping, 0, -1); } } @@ -886,17 +892,6 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) goto out; } -static bool nfs_dir_mapping_need_revalidate(struct inode *dir) -{ - struct nfs_inode *nfsi = NFS_I(dir); - - if (nfs_attribute_cache_expired(dir)) - return true; - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - return true; - return false; -} - /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. @@ -928,7 +923,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; - if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -1035,8 +1030,6 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, int rcu_walk) { - int ret; - if (IS_ROOT(dentry)) return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) @@ -1044,12 +1037,12 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ - if (rcu_walk) - ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir); - else - ret = nfs_revalidate_inode(NFS_SERVER(dir), dir); - if (ret < 0) - return 0; + if (nfs_mapping_need_revalidate_inode(dir)) { + if (rcu_walk) + return 0; + if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) + return 0; + } if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; return 1; @@ -1161,7 +1154,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; goto out_bad; } - goto out_valid_noent; + goto out_valid; } if (is_bad_inode(inode)) { @@ -1184,6 +1177,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; goto out_zap_parent; } + nfs_advise_use_readdirplus(dir); goto out_valid; } @@ -1219,12 +1213,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) nfs_free_fhandle(fhandle); nfs4_label_free(label); + /* set a readdirplus hint that we had a cache miss */ + nfs_force_use_readdirplus(dir); + out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: - /* Success: notify readdir to use READDIRPLUS */ - nfs_advise_use_readdirplus(dir); - out_valid_noent: if (flags & LOOKUP_RCU) { if (parent != ACCESS_ONCE(dentry->d_parent)) return -ECHILD; @@ -1424,8 +1418,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(res)) goto out_label; - /* Success: notify readdir to use READDIRPLUS */ - nfs_advise_use_readdirplus(dir); + /* Notify readdir to use READDIRPLUS */ + nfs_force_use_readdirplus(dir); no_entry: res = d_splice_alias(inode, dentry); @@ -1467,9 +1461,9 @@ static fmode_t flags_to_mode(int flags) return res; } -static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) +static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { - return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); + return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); } static int do_open(struct inode *inode, struct file *filp) @@ -1535,8 +1529,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, return -ENAMETOOLONG; if (open_flags & O_CREAT) { + struct nfs_server *server = NFS_SERVER(dir); + + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + mode &= ~current_umask(); + attr.ia_valid |= ATTR_MODE; - attr.ia_mode = mode & ~current_umask(); + attr.ia_mode = mode; } if (open_flags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; @@ -1554,7 +1553,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, return finish_no_open(file, dentry); } - ctx = create_nfs_open_context(dentry, open_flags); + ctx = create_nfs_open_context(dentry, open_flags, file); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index bd81bcf3ffcf..be88bcdca692 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -105,7 +105,7 @@ struct nfs_direct_req { static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); +static void nfs_direct_write_complete(struct nfs_direct_req *dreq); static void nfs_direct_write_schedule_work(struct work_struct *work); static inline void get_dreq(struct nfs_direct_req *dreq) @@ -684,7 +684,7 @@ out_failed: } if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, dreq->inode); + nfs_direct_write_complete(dreq); } static void nfs_direct_commit_complete(struct nfs_commit_data *data) @@ -717,7 +717,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) } if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) - nfs_direct_write_complete(dreq, data->inode); + nfs_direct_write_complete(dreq); } static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, @@ -768,7 +768,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) } } -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +static void nfs_direct_write_complete(struct nfs_direct_req *dreq) { schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ } @@ -824,7 +824,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) out_put: if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, hdr->inode); + nfs_direct_write_complete(dreq); hdr->release(hdr); } @@ -953,7 +953,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, dreq->inode); + nfs_direct_write_complete(dreq); return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9ea85ae23c32..64c11f399b3d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -102,8 +102,11 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); + const unsigned long force_reval = NFS_INO_REVAL_PAGECACHE|NFS_INO_REVAL_FORCED; + unsigned long cache_validity = nfsi->cache_validity; - if (nfs_have_delegated_attributes(inode)) + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) && + (cache_validity & force_reval) != force_reval) goto out_noreval; if (filp->f_flags & O_DIRECT) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4946ef40ba87..a5589b791439 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -279,8 +279,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, 4, - s->nfs_client->cl_minorversion, - s->nfs_client->cl_rpcclient->cl_auth->au_flavor); + s->nfs_client->cl_minorversion); out_test_devid: if (filelayout_test_devid_unavailable(devid)) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 98ace127bf86..9e111d07f667 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -25,9 +25,20 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) +#define FF_LAYOUTRETURN_MAXERR 20 + static struct group_info *ff_zero_group; +static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr); +static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, + struct nfs42_layoutstat_devinfo *devinfo, + int dev_limit); +static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, + const struct nfs42_layoutstat_devinfo *devinfo, + struct nfs4_ff_layout_mirror *mirror); + static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) { @@ -172,7 +183,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (mirror->mirror_ds != pos->mirror_ds) + if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; @@ -349,19 +360,6 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) } } -static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls) -{ - struct nfs4_deviceid_node *node; - int i; - - if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS)) - return; - for (i = 0; i < fls->mirror_array_cnt; i++) { - node = &fls->mirror_array[i]->mirror_ds->id_node; - clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); - } -} - static struct pnfs_layout_segment * ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_layoutget_res *lgr, @@ -415,8 +413,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; - struct nfs4_deviceid devid; - struct nfs4_deviceid_node *idnode; struct auth_cred acred = { .group_info = ff_zero_group }; struct rpc_cred __rcu *cred; u32 ds_count, fh_count, id; @@ -441,24 +437,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->ds_count = ds_count; /* deviceid */ - rc = decode_deviceid(&stream, &devid); + rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); if (rc) goto out_err_free; - idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), - &devid, lh->plh_lc_cred, - gfp_flags); - /* - * upon success, mirror_ds is allocated by previous - * getdeviceinfo, or newly by .alloc_deviceid_node - * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure - */ - if (idnode) - fls->mirror_array[i]->mirror_ds = - FF_LAYOUT_MIRROR_DS(idnode); - else - goto out_err_free; - /* efficiency */ rc = -EIO; p = xdr_inline_decode(&stream, 4); @@ -556,8 +538,6 @@ out_sort_mirrors: rc = ff_layout_check_layout(lgr); if (rc) goto out_err_free; - ff_layout_mark_devices_valid(fls); - ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: @@ -702,6 +682,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, spin_lock(&mirror->lock); report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); if (report) @@ -718,6 +699,7 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, requested, completed, ktime_get(), task->tk_start); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } @@ -731,6 +713,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, spin_lock(&mirror->lock); report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); if (report) @@ -750,6 +733,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, spin_lock(&mirror->lock); nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, requested, completed, ktime_get(), task->tk_start); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } @@ -1293,6 +1277,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; + ff_layout_read_record_layoutstats_done(task, hdr); pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1961,38 +1946,88 @@ ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) id_node)); } -static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) +static int ff_layout_encode_ioerr(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + const struct nfs4_flexfile_layoutreturn_args *ff_args) { - struct pnfs_layout_hdr *hdr = &flo->generic_hdr; __be32 *start; - int count = 0, ret = 0; start = xdr_reserve_space(xdr, 4); if (unlikely(!start)) return -E2BIG; + *start = cpu_to_be32(ff_args->num_errors); /* This assume we always return _ALL_ layouts */ - spin_lock(&hdr->plh_inode->i_lock); - ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range); - spin_unlock(&hdr->plh_inode->i_lock); + return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors); +} - *start = cpu_to_be32(count); +static void +encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) +{ + __be32 *p; - return ret; + p = xdr_reserve_space(xdr, len); + xdr_encode_opaque_fixed(p, buf, len); +} + +static void +ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr, + const nfs4_stateid *stateid, + const struct nfs42_layoutstat_devinfo *devinfo) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, devinfo->offset); + p = xdr_encode_hyper(p, devinfo->length); + encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); + p = xdr_reserve_space(xdr, 4*8); + p = xdr_encode_hyper(p, devinfo->read_count); + p = xdr_encode_hyper(p, devinfo->read_bytes); + p = xdr_encode_hyper(p, devinfo->write_count); + p = xdr_encode_hyper(p, devinfo->write_bytes); + encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE); +} + +static void +ff_layout_encode_ff_iostat(struct xdr_stream *xdr, + const nfs4_stateid *stateid, + const struct nfs42_layoutstat_devinfo *devinfo) +{ + ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo); + ff_layout_encode_ff_layoutupdate(xdr, devinfo, + devinfo->ld_private.data); } /* report nothing for now */ -static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) +static void ff_layout_encode_iostats_array(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct nfs4_flexfile_layoutreturn_args *ff_args) { __be32 *p; + int i; p = xdr_reserve_space(xdr, 4); - if (likely(p)) - *p = cpu_to_be32(0); + *p = cpu_to_be32(ff_args->num_dev); + for (i = 0; i < ff_args->num_dev; i++) + ff_layout_encode_ff_iostat(xdr, + &args->layout->plh_stateid, + &ff_args->devinfo[i]); +} + +static void +ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo, + unsigned int num_entries) +{ + unsigned int i; + + for (i = 0; i < num_entries; i++) { + if (!devinfo[i].ld_private.ops) + continue; + if (!devinfo[i].ld_private.ops->free) + continue; + devinfo[i].ld_private.ops->free(&devinfo[i].ld_private); + } } static struct nfs4_deviceid_node * @@ -2008,24 +2043,91 @@ ff_layout_alloc_deviceid_node(struct nfs_server *server, } static void -ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) -{ - struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); +ff_layout_encode_layoutreturn(struct xdr_stream *xdr, + const void *voidargs, + const struct nfs4_xdr_opaque_data *ff_opaque) +{ + const struct nfs4_layoutreturn_args *args = voidargs; + struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data; + struct xdr_buf tmp_buf = { + .head = { + [0] = { + .iov_base = page_address(ff_args->pages[0]), + }, + }, + .buflen = PAGE_SIZE, + }; + struct xdr_stream tmp_xdr; __be32 *start; dprintk("%s: Begin\n", __func__); - start = xdr_reserve_space(xdr, 4); - BUG_ON(!start); - ff_layout_encode_ioerr(flo, xdr, args); - ff_layout_encode_iostats(flo, xdr, args); + xdr_init_encode(&tmp_xdr, &tmp_buf, NULL); + + ff_layout_encode_ioerr(&tmp_xdr, args, ff_args); + ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args); + + start = xdr_reserve_space(xdr, 4); + *start = cpu_to_be32(tmp_buf.len); + xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len); - *start = cpu_to_be32((xdr->p - start - 1) * 4); dprintk("%s: Return\n", __func__); } +static void +ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args) +{ + struct nfs4_flexfile_layoutreturn_args *ff_args; + + if (!args->data) + return; + ff_args = args->data; + args->data = NULL; + + ff_layout_free_ds_ioerr(&ff_args->errors); + ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev); + + put_page(ff_args->pages[0]); + kfree(ff_args); +} + +const struct nfs4_xdr_opaque_ops layoutreturn_ops = { + .encode = ff_layout_encode_layoutreturn, + .free = ff_layout_free_layoutreturn, +}; + +static int +ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) +{ + struct nfs4_flexfile_layoutreturn_args *ff_args; + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout); + + ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL); + if (!ff_args) + goto out_nomem; + ff_args->pages[0] = alloc_page(GFP_KERNEL); + if (!ff_args->pages[0]) + goto out_nomem_free; + + INIT_LIST_HEAD(&ff_args->errors); + ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout, + &args->range, &ff_args->errors, + FF_LAYOUTRETURN_MAXERR); + + spin_lock(&args->inode->i_lock); + ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, + &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo)); + spin_unlock(&args->inode->i_lock); + + args->ld_private->ops = &layoutreturn_ops; + args->ld_private->data = ff_args; + return 0; +out_nomem_free: + kfree(ff_args); +out_nomem: + return -ENOMEM; +} + static int ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) { @@ -2146,21 +2248,18 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, } static void -ff_layout_encode_layoutstats(struct xdr_stream *xdr, - struct nfs42_layoutstat_args *args, - struct nfs42_layoutstat_devinfo *devinfo) +ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, + const struct nfs42_layoutstat_devinfo *devinfo, + struct nfs4_ff_layout_mirror *mirror) { - struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private; struct nfs4_pnfs_ds_addr *da; struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; struct nfs_fh *fh = &mirror->fh_versions[0]; - __be32 *p, *start; + __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); dprintk("%s: DS %s: encoding address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - /* layoutupdate length */ - start = xdr_reserve_space(xdr, 4); /* netaddr4 */ ff_layout_encode_netaddr(xdr, da); /* nfs_fh4 */ @@ -2177,42 +2276,71 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); +} + +static void +ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, + const struct nfs4_xdr_opaque_data *opaque) +{ + struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque, + struct nfs42_layoutstat_devinfo, ld_private); + __be32 *start; + + /* layoutupdate length */ + start = xdr_reserve_space(xdr, 4); + ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data); *start = cpu_to_be32((xdr->p - start - 1) * 4); } +static void +ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) +{ + struct nfs4_ff_layout_mirror *mirror = opaque->data; + + ff_layout_put_mirror(mirror); +} + +static const struct nfs4_xdr_opaque_ops layoutstat_ops = { + .encode = ff_layout_encode_layoutstats, + .free = ff_layout_free_layoutstats, +}; + static int -ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, - struct pnfs_layout_hdr *lo, +ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, + struct nfs42_layoutstat_devinfo *devinfo, int dev_limit) { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *dev; - struct nfs42_layoutstat_devinfo *devinfo; int i = 0; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { if (i >= dev_limit) break; - if (!mirror->mirror_ds) + if (IS_ERR_OR_NULL(mirror->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) continue; /* mirror refcount put in cleanup_layoutstats */ if (!atomic_inc_not_zero(&mirror->ref)) continue; dev = &mirror->mirror_ds->id_node; - devinfo = &args->devinfo[i]; memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); devinfo->offset = 0; devinfo->length = NFS4_MAX_UINT64; + spin_lock(&mirror->lock); devinfo->read_count = mirror->read_stat.io_stat.ops_completed; devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; devinfo->write_count = mirror->write_stat.io_stat.ops_completed; devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->layoutstats_encode = ff_layout_encode_layoutstats; - devinfo->layout_private = mirror; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = mirror; + devinfo++; i++; } return i; @@ -2222,47 +2350,27 @@ static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { struct nfs4_flexfile_layout *ff_layout; - struct nfs4_ff_layout_mirror *mirror; - int dev_count = 0; + const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; - spin_lock(&args->inode->i_lock); - ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); - list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (atomic_read(&mirror->ref) != 0) - dev_count ++; - } - spin_unlock(&args->inode->i_lock); /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ - if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) { - dprintk("%s: truncating devinfo to limit (%d:%d)\n", - __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); - dev_count = PNFS_LAYOUTSTATS_MAXDEV; - } args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); if (!args->devinfo) return -ENOMEM; spin_lock(&args->inode->i_lock); - args->num_dev = ff_layout_mirror_prepare_stats(args, - &ff_layout->generic_hdr, dev_count); + ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); + args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, + &args->devinfo[0], dev_count); spin_unlock(&args->inode->i_lock); + if (!args->num_dev) { + kfree(args->devinfo); + args->devinfo = NULL; + return -ENOENT; + } return 0; } -static void -ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) -{ - struct nfs4_ff_layout_mirror *mirror; - int i; - - for (i = 0; i < data->args.num_dev; i++) { - mirror = data->args.devinfo[i].layout_private; - data->args.devinfo[i].layout_private = NULL; - ff_layout_put_mirror(mirror); - } -} - static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -2284,10 +2392,9 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .read_pagelist = ff_layout_read_pagelist, .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, - .encode_layoutreturn = ff_layout_encode_layoutreturn, + .prepare_layoutreturn = ff_layout_prepare_layoutreturn, .sync = pnfs_nfs_generic_sync, .prepare_layoutstats = ff_layout_prepare_layoutstats, - .cleanup_layoutstats = ff_layout_cleanup_layoutstats, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 3ee0c9fcea76..f4f39b0ab09b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -21,6 +21,7 @@ /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) +#define FF_LAYOUTSTATS_MAXDEV 4 struct nfs4_ff_ds_version { u32 version; @@ -73,6 +74,7 @@ struct nfs4_ff_layout_mirror { struct list_head mirrors; u32 ds_count; u32 efficiency; + struct nfs4_deviceid devid; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; @@ -81,12 +83,15 @@ struct nfs4_ff_layout_mirror { struct rpc_cred __rcu *rw_cred; atomic_t ref; spinlock_t lock; + unsigned long flags; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; u32 report_interval; }; +#define NFS4_FF_MIRROR_STAT_AVAIL (0) + struct nfs4_ff_layout_segment { struct pnfs_layout_segment generic_hdr; u64 stripe_unit; @@ -103,6 +108,14 @@ struct nfs4_flexfile_layout { ktime_t last_report_time; /* Layoutstat report times */ }; +struct nfs4_flexfile_layoutreturn_args { + struct list_head errors; + struct nfs42_layoutstat_devinfo devinfo[FF_LAYOUTSTATS_MAXDEV]; + unsigned int num_errors; + unsigned int num_dev; + struct page *pages[1]; +}; + static inline struct nfs4_flexfile_layout * FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) { @@ -180,9 +193,12 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_mirror *mirror, u64 offset, u64 length, int status, enum nfs_opnum4 opnum, gfp_t gfp_flags); -int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, int *count, - const struct pnfs_layout_range *range); +int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); +void ff_layout_free_ds_ioerr(struct list_head *head); +unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum); struct nfs_fh * nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); @@ -197,7 +213,6 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, struct inode *inode); struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred); -bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index f7a3f6b05369..3cc39d1c1206 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -20,9 +20,11 @@ static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; static unsigned int dataserver_retrans; +static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); + void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { - if (mirror_ds) + if (!IS_ERR_OR_NULL(mirror_ds)) nfs4_put_deviceid_node(&mirror_ds->id_node); } @@ -182,12 +184,29 @@ static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, } static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_mirror *mirror, + bool create) { - if (mirror == NULL || mirror->mirror_ds == NULL) { - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, - lseg); - return false; + if (mirror == NULL || IS_ERR(mirror->mirror_ds)) + goto outerr; + if (mirror->mirror_ds == NULL) { + if (create) { + struct nfs4_deviceid_node *node; + struct pnfs_layout_hdr *lh = lseg->pls_layout; + struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); + + node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), + &mirror->devid, lh->plh_lc_cred, + GFP_KERNEL); + if (node) + mirror_ds = FF_LAYOUT_MIRROR_DS(node); + + /* check for race with another call to this function */ + if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + mirror_ds != ERR_PTR(-ENODEV)) + nfs4_put_deviceid_node(node); + } else + goto outerr; } if (mirror->mirror_ds->ds == NULL) { struct nfs4_deviceid_node *devid; @@ -196,15 +215,9 @@ static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, return false; } return true; -} - -static u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; +outerr: + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); + return false; } static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, @@ -212,8 +225,8 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, { u64 end; - end = max_t(u64, end_offset(err->offset, err->length), - end_offset(offset, length)); + end = max_t(u64, pnfs_end_offset(err->offset, err->length), + pnfs_end_offset(offset, length)); err->offset = min_t(u64, err->offset, offset); err->length = end - err->offset; } @@ -235,9 +248,9 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); if (ret != 0) return ret; - if (end_offset(e1->offset, e1->length) < e2->offset) + if (pnfs_end_offset(e1->offset, e1->length) < e2->offset) return -1; - if (e1->offset > end_offset(e2->offset, e2->length)) + if (e1->offset > pnfs_end_offset(e2->offset, e2->length)) return 1; /* If ranges overlap or are contiguous, they are the same */ return 0; @@ -263,8 +276,9 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } /* Entries match, so merge "err" into "dserr" */ extend_ds_error(dserr, err->offset, err->length); - list_del(&err->list); + list_replace(&err->list, &dserr->list); kfree(err); + return; } list_add_tail(&dserr->list, head); @@ -331,7 +345,7 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); struct nfs_fh *fh = NULL; - if (!ff_layout_mirror_valid(lseg, mirror)) { + if (!ff_layout_mirror_valid(lseg, mirror, false)) { pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", __func__, mirror_idx); goto out; @@ -371,7 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - if (!ff_layout_mirror_valid(lseg, mirror)) { + if (!ff_layout_mirror_valid(lseg, mirror, true)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", __func__, ds_idx); goto out; @@ -393,8 +407,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version, - RPC_AUTH_UNIX); + mirror->mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (ds->ds_clp) { @@ -457,28 +470,26 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, } } -static bool is_range_intersecting(u64 offset1, u64 length1, - u64 offset2, u64 length2) +void ff_layout_free_ds_ioerr(struct list_head *head) { - u64 end1 = end_offset(offset1, length1); - u64 end2 = end_offset(offset2, length2); + struct nfs4_ff_layout_ds_err *err; - return (end1 == NFS4_MAX_UINT64 || end1 > offset2) && - (end2 == NFS4_MAX_UINT64 || end2 > offset1); + while (!list_empty(head)) { + err = list_first_entry(head, + struct nfs4_ff_layout_ds_err, + list); + list_del(&err->list); + kfree(err); + } } /* called with inode i_lock held */ -int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, int *count, - const struct pnfs_layout_range *range) +int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head) { - struct nfs4_ff_layout_ds_err *err, *n; + struct nfs4_ff_layout_ds_err *err; __be32 *p; - list_for_each_entry_safe(err, n, &flo->error_list, list) { - if (!is_range_intersecting(err->offset, err->length, - range->offset, range->length)) - continue; + list_for_each_entry(err, head, list) { /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) * + array length + deviceid(NFS4_DEVICEID4_SIZE) * + status(4) + opnum(4) @@ -497,17 +508,59 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(err->status); *p++ = cpu_to_be32(err->opnum); - *count += 1; - list_del(&err->list); - dprintk("%s: offset %llu length %llu status %d op %d count %d\n", + dprintk("%s: offset %llu length %llu status %d op %d\n", __func__, err->offset, err->length, err->status, - err->opnum, *count); - kfree(err); + err->opnum); } return 0; } +static +unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum) +{ + struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); + struct inode *inode = lo->plh_inode; + struct nfs4_ff_layout_ds_err *err, *n; + unsigned int ret = 0; + + spin_lock(&inode->i_lock); + list_for_each_entry_safe(err, n, &flo->error_list, list) { + if (!pnfs_is_range_intersecting(err->offset, + pnfs_end_offset(err->offset, err->length), + range->offset, + pnfs_end_offset(range->offset, range->length))) + continue; + if (!maxnum) + break; + list_move(&err->list, head); + maxnum--; + ret++; + } + spin_unlock(&inode->i_lock); + return ret; +} + +unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum) +{ + unsigned int ret; + + ret = do_layout_fetch_ds_ioerr(lo, range, head, maxnum); + /* If we're over the max, discard all remaining entries */ + if (ret == maxnum) { + LIST_HEAD(discard); + do_layout_fetch_ds_ioerr(lo, range, &discard, -1); + ff_layout_free_ds_ioerr(&discard); + } + return ret; +} + static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; @@ -516,7 +569,11 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror && mirror->mirror_ds) { + if (mirror) { + if (!mirror->mirror_ds) + return true; + if (IS_ERR(mirror->mirror_ds)) + continue; devid = &mirror->mirror_ds->id_node; if (!ff_layout_test_devid_unavailable(devid)) return true; @@ -534,8 +591,10 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || !mirror->mirror_ds) + if (!mirror || IS_ERR(mirror->mirror_ds)) return false; + if (!mirror->mirror_ds) + continue; devid = &mirror->mirror_ds->id_node; if (ff_layout_test_devid_unavailable(devid)) return false; @@ -544,7 +603,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; } -bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) { if (lseg->pls_range.iomode == IOMODE_READ) return ff_read_layout_has_available_ds(lseg); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bf4ec5ecc97e..5864146e05e6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -634,15 +634,28 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); -static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) { struct dentry *parent; + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) + return; parent = dget_parent(dentry); nfs_force_use_readdirplus(d_inode(parent)); dput(parent); } +static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) +{ + struct dentry *parent; + + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) + return; + parent = dget_parent(dentry); + nfs_advise_use_readdirplus(d_inode(parent)); + dput(parent); +} + static bool nfs_need_revalidate_inode(struct inode *inode) { if (NFS_I(inode)->cache_validity & @@ -683,10 +696,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - if (server->caps & NFS_CAP_READDIRPLUS) - nfs_request_parent_use_readdirplus(dentry); + nfs_readdirplus_parent_cache_miss(dentry); err = __nfs_revalidate_inode(server, inode); - } + } else + nfs_readdirplus_parent_cache_hit(dentry); if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); @@ -702,8 +715,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { atomic_set(&l_ctx->count, 1); - l_ctx->lockowner.l_owner = current->files; - l_ctx->lockowner.l_pid = current->tgid; + l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); } @@ -714,9 +726,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context struct nfs_lock_context *pos = head; do { - if (pos->lockowner.l_owner != current->files) - continue; - if (pos->lockowner.l_pid != current->tgid) + if (pos->lockowner != current->files) continue; atomic_inc(&pos->count); return pos; @@ -799,7 +809,9 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) } EXPORT_SYMBOL_GPL(nfs_close_context); -struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, + fmode_t f_mode, + struct file *filp) { struct nfs_open_context *ctx; struct rpc_cred *cred = rpc_lookup_cred(); @@ -818,6 +830,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; + ctx->flock_owner = (fl_owner_t)filp; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); @@ -942,7 +955,7 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); @@ -1099,11 +1112,17 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } -static bool nfs_mapping_need_revalidate_inode(struct inode *inode) +bool nfs_mapping_need_revalidate_inode(struct inode *inode) { - if (nfs_have_delegated_attributes(inode)) - return false; - return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) + unsigned long cache_validity = NFS_I(inode)->cache_validity; + + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { + const unsigned long force_reval = + NFS_INO_REVAL_PAGECACHE|NFS_INO_REVAL_FORCED; + return (cache_validity & force_reval) == force_reval; + } + + return (cache_validity & NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode) || NFS_STALE(inode); } @@ -1317,7 +1336,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - nfs_set_cache_invalid(inode, invalid); + nfs_set_cache_invalid(inode, invalid | NFS_INO_REVAL_FORCED); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -2015,7 +2034,7 @@ static void nfsiod_stop(void) destroy_workqueue(wq); } -int nfs_net_id; +unsigned int nfs_net_id; EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 80bcc0befb07..6b79c2ca9b9a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -154,8 +154,7 @@ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); -struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, - rpc_authflavor_t); +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); @@ -194,14 +193,13 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - u32 minor_version, - rpc_authflavor_t au_flavor); + u32 minor_version); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, - unsigned int ds_retrans, rpc_authflavor_t au_flavor); + unsigned int ds_retrans); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -346,6 +344,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ +extern void nfs_advise_use_readdirplus(struct inode *dir); extern void nfs_force_use_readdirplus(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index fbce0d885d4c..5fbd2bde91ba 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -35,6 +35,6 @@ struct nfs_net { #endif }; -extern int nfs_net_id; +extern unsigned int nfs_net_id; #endif diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index ee753547fb0a..7879f2a0fcfd 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -78,8 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, */ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, - int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - rpc_authflavor_t au_flavor) + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; struct nfs_client *mds_clp = mds_srv->nfs_client; @@ -106,7 +105,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, au_flavor); + clp = nfs_get_client(&cl_init); return clp; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 608501971fe0..d12ff9385f49 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -397,10 +397,13 @@ static void nfs42_layoutstat_release(void *calldata) { struct nfs42_layoutstat_data *data = calldata; - struct nfs_server *nfss = NFS_SERVER(data->args.inode); + struct nfs42_layoutstat_devinfo *devinfo = data->args.devinfo; + int i; - if (nfss->pnfs_curr_ld->cleanup_layoutstats) - nfss->pnfs_curr_ld->cleanup_layoutstats(data); + for (i = 0; i < data->args.num_dev; i++) { + if (devinfo[i].ld_private.ops && devinfo[i].ld_private.ops->free) + devinfo[i].ld_private.ops->free(&devinfo[i].ld_private); + } pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout); smp_mb__before_atomic(); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 8b2605882a20..6c7296454bbc 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -181,8 +181,9 @@ static void encode_layoutstats(struct xdr_stream *xdr, NFS4_DEVICEID4_SIZE); /* Encode layoutupdate4 */ *p++ = cpu_to_be32(devinfo->layout_type); - if (devinfo->layoutstats_encode != NULL) - devinfo->layoutstats_encode(xdr, args, devinfo); + if (devinfo->ld_private.ops) + devinfo->ld_private.ops->encode(xdr, args, + &devinfo->ld_private); else encode_uint32(xdr, 0); } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1452177c822d..665165833660 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -457,7 +457,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *, extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, - const struct nfs_lockowner *, nfs4_stateid *, + const struct nfs_lock_context *, nfs4_stateid *, struct rpc_cred **); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 074ac7131459..5ae9d64ea08b 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -464,6 +464,11 @@ static bool nfs4_match_client_owner_id(const struct nfs_client *clp1, return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0; } +static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0; +} + /** * nfs40_walk_client_list - Find server that recognizes a client ID * @@ -521,7 +526,21 @@ int nfs40_walk_client_list(struct nfs_client *new, if (!nfs4_match_client_owner_id(pos, new)) continue; - + /* + * We just sent a new SETCLIENTID, which should have + * caused the server to return a new cl_confirm. So if + * cl_confirm is the same, then this is a different + * server that just returned the same cl_confirm by + * coincidence: + */ + if ((new != pos) && nfs4_same_verifier(&pos->cl_confirm, + &new->cl_confirm)) + continue; + /* + * But if the cl_confirm's are different, then the only + * way that a SETCLIENTID_CONFIRM to pos can succeed is + * if new and pos point to the same server: + */ atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -534,6 +553,7 @@ int nfs40_walk_client_list(struct nfs_client *new, break; case 0: nfs4_swap_callback_idents(pos, new); + pos->cl_confirm = new->cl_confirm; prev = NULL; *result = pos; @@ -881,7 +901,6 @@ static int nfs4_set_client(struct nfs_server *server, const struct sockaddr *addr, const size_t addrlen, const char *ip_addr, - rpc_authflavor_t authflavour, int proto, const struct rpc_timeout *timeparms, u32 minorversion, struct net *net) { @@ -907,7 +926,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, authflavour); + clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; @@ -948,7 +967,7 @@ error: struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - u32 minor_version, rpc_authflavor_t au_flavor) + u32 minor_version) { struct rpc_timeout ds_timeout; struct nfs_client *mds_clp = mds_srv->nfs_client; @@ -979,7 +998,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, au_flavor); + clp = nfs_get_client(&cl_init); dprintk("<-- %s %p\n", __func__, clp); return clp; @@ -1103,7 +1122,6 @@ static int nfs4_init_server(struct nfs_server *server, (const struct sockaddr *)&data->nfs_server.address, data->nfs_server.addrlen, data->client_address, - data->selected_flavor, data->nfs_server.protocol, &timeparms, data->minorversion, @@ -1200,7 +1218,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, data->addr, data->addrlen, parent_client->cl_ipaddr, - data->authflavor, rpc_protocol(parent_server->client), parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, @@ -1311,7 +1328,6 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_server_remove_lists(server); error = nfs4_set_client(server, hostname, sap, salen, buf, - clp->cl_rpcclient->cl_auth->au_flavor, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, net); nfs_put_client(clp); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 89a77950e0b0..0efba77789b9 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 241da19b7da4..d33242c8d95d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -94,7 +94,7 @@ static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fa static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel); #ifdef CONFIG_NFS_V4_1 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, @@ -226,7 +226,6 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { static const u32 nfs4_open_noattr_bitmap[3] = { FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE | FATTR4_WORD0_FILEID, }; @@ -817,6 +816,10 @@ static int nfs41_sequence_process(struct rpc_task *task, case -NFS4ERR_SEQ_FALSE_RETRY: ++slot->seq_nr; goto retry_nowait; + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_BADSESSION: + nfs4_schedule_session_recovery(session, res->sr_status); + goto retry_nowait; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -1221,6 +1224,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, atomic_inc(&sp->so_count); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + p->o_arg.umask = current_umask(); + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); p->o_arg.share_access = nfs4_map_atomic_open_share(server, fmode, flags); /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS @@ -1228,8 +1233,16 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (!(flags & O_EXCL)) { /* ask server to check for all possible rights as results * are cached */ - p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | - NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; + switch (p->o_arg.claim) { + default: + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + p->o_arg.access = NFS4_ACCESS_READ | + NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | + NFS4_ACCESS_EXECUTE; + } } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); @@ -1239,7 +1252,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; p->o_arg.label = nfs4_label_copy(p->a_label, label); - p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_CUR: @@ -2819,7 +2831,7 @@ static int _nfs4_do_open(struct inode *dir, nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, - state, label, olabel); + ctx, label, olabel); if (status == 0) { nfs_setattr_update_inode(state->inode, sattr, opendata->o_res.f_attr); @@ -2914,7 +2926,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_setattrargs *arg, struct nfs_setattrres *res, struct rpc_cred *cred, - struct nfs4_state *state) + struct nfs_open_context *ctx) { struct nfs_server *server = NFS_SERVER(inode); struct rpc_message msg = { @@ -2937,15 +2949,17 @@ static int _nfs4_do_setattr(struct inode *inode, if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { /* Use that stateid */ - } else if (truncate && state != NULL) { - struct nfs_lockowner lockowner = { - .l_owner = current->files, - .l_pid = current->tgid, - }; - if (!nfs4_valid_open_stateid(state)) + } else if (truncate && ctx != NULL) { + struct nfs_lock_context *l_ctx; + if (!nfs4_valid_open_stateid(ctx->state)) return -EBADF; - if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, - &arg->stateid, &delegation_cred) == -EIO) + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + return PTR_ERR(l_ctx); + status = nfs4_select_rw_stateid(ctx->state, FMODE_WRITE, l_ctx, + &arg->stateid, &delegation_cred); + nfs_put_lock_context(l_ctx); + if (status == -EIO) return -EBADF; } else nfs4_stateid_copy(&arg->stateid, &zero_stateid); @@ -2955,7 +2969,7 @@ static int _nfs4_do_setattr(struct inode *inode, status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); put_rpccred(delegation_cred); - if (status == 0 && state != NULL) + if (status == 0 && ctx != NULL) renew_lease(server, timestamp); trace_nfs4_setattr(inode, &arg->stateid, status); return status; @@ -2963,10 +2977,11 @@ static int _nfs4_do_setattr(struct inode *inode, static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_state *state = ctx ? ctx->state : NULL; struct nfs_setattrargs arg = { .fh = NFS_FH(inode), .iap = sattr, @@ -2991,7 +3006,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, arg.bitmask = nfs4_bitmask(server, olabel); do { - err = _nfs4_do_setattr(inode, &arg, &res, cred, state); + err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -3028,10 +3043,15 @@ struct nfs4_closedata { struct nfs4_state *state; struct nfs_closeargs arg; struct nfs_closeres res; + struct { + struct nfs4_layoutreturn_args arg; + struct nfs4_layoutreturn_res res; + struct nfs4_xdr_opaque_data ld_private; + u32 roc_barrier; + bool roc; + } lr; struct nfs_fattr fattr; unsigned long timestamp; - bool roc; - u32 roc_barrier; }; static void nfs4_free_closedata(void *data) @@ -3040,8 +3060,9 @@ static void nfs4_free_closedata(void *data) struct nfs4_state_owner *sp = calldata->state->owner; struct super_block *sb = calldata->state->inode->i_sb; - if (calldata->roc) - pnfs_roc_release(calldata->state->inode); + if (calldata->lr.roc) + pnfs_roc_release(&calldata->lr.arg, &calldata->lr.res, + calldata->res.lr_ret); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); @@ -3060,15 +3081,38 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); + + /* Handle Layoutreturn errors */ + if (calldata->arg.lr_args && task->tk_status != 0) { + switch (calldata->res.lr_ret) { + default: + calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + break; + case 0: + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + case -NFS4ERR_WRONG_CRED: + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + calldata->res.lr_ret = 0; + rpc_restart_call_prepare(task); + return; + } + } + /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors */ switch (task->tk_status) { case 0: res_stateid = &calldata->res.stateid; - if (calldata->roc) - pnfs_roc_set_barrier(state->inode, - calldata->roc_barrier); renew_lease(server, calldata->timestamp); break; case -NFS4ERR_ADMIN_REVOKED: @@ -3144,15 +3188,20 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_no_action; } - if (nfs4_wait_on_layoutreturn(inode, task)) { + if (!calldata->lr.roc && nfs4_wait_on_layoutreturn(inode, task)) { nfs_release_seqid(calldata->arg.seqid); goto out_wait; } - if (calldata->arg.fmode == 0) + if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; - if (calldata->roc) - pnfs_roc_get_barrier(inode, &calldata->roc_barrier); + + /* Close-to-open cache consistency revalidation */ + if (!nfs4_have_delegation(inode, FMODE_READ)) + calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + else + calldata->arg.bitmask = NULL; + } calldata->arg.share_access = nfs4_map_atomic_open_share(NFS_SERVER(inode), @@ -3179,13 +3228,6 @@ static const struct rpc_call_ops nfs4_close_ops = { .rpc_release = nfs4_free_closedata, }; -static bool nfs4_roc(struct inode *inode) -{ - if (!nfs_have_layout(inode)) - return false; - return pnfs_roc(inode); -} - /* * It is possible for data to be read/written from a mem-mapped file * after the sys_close call (which hits the vfs layer as a flush). @@ -3233,11 +3275,17 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) if (IS_ERR(calldata->arg.seqid)) goto out_free_calldata; calldata->arg.fmode = 0; - calldata->arg.bitmask = server->cache_consistency_bitmask; + calldata->lr.arg.ld_private = &calldata->lr.ld_private; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->roc = nfs4_roc(state->inode); + calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + calldata->lr.roc = pnfs_roc(state->inode, + &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred); + if (calldata->lr.roc) { + calldata->arg.lr_args = &calldata->lr.arg; + calldata->res.lr_res = &calldata->lr.res; + } nfs_sb_active(calldata->inode->i_sb); msg.rpc_argp = &calldata->arg; @@ -3290,7 +3338,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3687,7 +3735,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, { struct inode *inode = d_inode(dentry); struct rpc_cred *cred = NULL; - struct nfs4_state *state = NULL; + struct nfs_open_context *ctx = NULL; struct nfs4_label *label = NULL; int status; @@ -3708,20 +3756,17 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, /* Search for an existing open(O_WRITE) file */ if (sattr->ia_valid & ATTR_FILE) { - struct nfs_open_context *ctx; ctx = nfs_file_open_context(sattr->ia_file); - if (ctx) { + if (ctx) cred = ctx->cred; - state = ctx->state; - } } label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); if (IS_ERR(label)) return PTR_ERR(label); - status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); + status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label); if (status == 0) { nfs_setattr_update_inode(inode, sattr, fattr); nfs_setsecurity(inode, fattr, label); @@ -3966,18 +4011,20 @@ static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; - ctx = alloc_nfs_open_context(dentry, FMODE_READ); + ctx = alloc_nfs_open_context(dentry, FMODE_READ, NULL); if (IS_ERR(ctx)) return PTR_ERR(ctx); ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -4185,6 +4232,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, data->arg.attrs = sattr; data->arg.ftype = ftype; data->arg.bitmask = nfs4_bitmask(server, data->label); + data->arg.umask = current_umask(); data->res.server = server; data->res.fh = &data->fh; data->res.fattr = &data->fattr; @@ -4282,13 +4330,15 @@ out: static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_label l, *label = NULL; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); do { err = _nfs4_proc_mkdir(dir, dentry, sattr, label); trace_nfs4_mkdir(dir, &dentry->d_name, err); @@ -4391,13 +4441,15 @@ out: static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_label l, *label = NULL; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); do { err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev); trace_nfs4_mknod(dir, &dentry->d_name, err); @@ -4541,11 +4593,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_lock_context *l_ctx, fmode_t fmode) { - const struct nfs_lockowner *lockowner = NULL; - - if (l_ctx != NULL) - lockowner = &l_ctx->lockowner; - return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL); + return nfs4_select_rw_stateid(ctx->state, fmode, l_ctx, stateid, NULL); } EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); @@ -5564,11 +5612,16 @@ struct nfs4_delegreturndata { struct nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + struct { + struct nfs4_layoutreturn_args arg; + struct nfs4_layoutreturn_res res; + struct nfs4_xdr_opaque_data ld_private; + u32 roc_barrier; + bool roc; + } lr; struct nfs_fattr fattr; int rpc_status; struct inode *inode; - bool roc; - u32 roc_barrier; }; static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) @@ -5579,6 +5632,32 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) return; trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); + + /* Handle Layoutreturn errors */ + if (data->args.lr_args && task->tk_status != 0) { + switch(data->res.lr_ret) { + default: + data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + break; + case 0: + data->args.lr_args = NULL; + data->res.lr_res = NULL; + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + case -NFS4ERR_WRONG_CRED: + data->args.lr_args = NULL; + data->res.lr_res = NULL; + data->res.lr_ret = 0; + rpc_restart_call_prepare(task); + return; + } + } + switch (task->tk_status) { case 0: renew_lease(data->res.server, data->timestamp); @@ -5602,8 +5681,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } } data->rpc_status = task->tk_status; - if (data->roc && data->rpc_status == 0) - pnfs_roc_set_barrier(data->inode, data->roc_barrier); } static void nfs4_delegreturn_release(void *calldata) @@ -5612,8 +5689,9 @@ static void nfs4_delegreturn_release(void *calldata) struct inode *inode = data->inode; if (inode) { - if (data->roc) - pnfs_roc_release(inode); + if (data->lr.roc) + pnfs_roc_release(&data->lr.arg, &data->lr.res, + data->res.lr_ret); nfs_iput_and_deactive(inode); } kfree(calldata); @@ -5625,12 +5703,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_wait_on_layoutreturn(d_data->inode, task)) + if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) return; - if (d_data->roc) - pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier); - nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, &d_data->res.seq_res, @@ -5676,12 +5751,22 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; data->res.server = server; + data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + data->lr.arg.ld_private = &data->lr.ld_private; nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; + data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, cred); data->inode = nfs_igrab_and_active(inode); - if (data->inode) - data->roc = nfs4_roc(inode); + if (data->inode) { + if (data->lr.roc) { + data->args.lr_args = &data->lr.arg; + data->res.lr_res = &data->lr.res; + } + } else if (data->lr.roc) { + pnfs_roc_release(&data->lr.arg, &data->lr.res, 0); + data->lr.roc = false; + } task_setup_data.callback_data = data; msg.rpc_argp = &data->args; @@ -8559,21 +8644,13 @@ static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; - LIST_HEAD(freeme); dprintk("--> %s\n", __func__); - spin_lock(&lo->plh_inode->i_lock); - if (lrp->res.lrs_present) { - pnfs_mark_matching_lsegs_invalid(lo, &freeme, - &lrp->args.range, - be32_to_cpu(lrp->args.stateid.seqid)); - pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - } else - pnfs_mark_layout_stateid_invalid(lo, &freeme); - pnfs_clear_layoutreturn_waitbit(lo); - spin_unlock(&lo->plh_inode->i_lock); + pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range, + lrp->res.lrs_present ? &lrp->res.stateid : NULL); nfs4_sequence_free_slot(&lrp->res.seq_res); - pnfs_free_lseg_list(&freeme); + if (lrp->ld_private.ops && lrp->ld_private.ops->free) + lrp->ld_private.ops->free(&lrp->ld_private); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); kfree(calldata); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index a61350f75c74..769b85655c4b 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -169,7 +169,7 @@ bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid) { if (slotid <= tbl->max_slotid) - return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + return nfs4_find_or_create_slot(tbl, slotid, 0, GFP_NOWAIT); return ERR_PTR(-E2BIG); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0959c9661662..95baf7d340f0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -800,11 +800,13 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) * that is compatible with current->files */ static struct nfs4_lock_state * -__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +__nfs4_find_lock_state(struct nfs4_state *state, + fl_owner_t fl_owner, fl_owner_t fl_owner2) { struct nfs4_lock_state *pos; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner != fl_owner) + if (pos->ls_owner != fl_owner && + pos->ls_owner != fl_owner2) continue; atomic_inc(&pos->ls_count); return pos; @@ -857,7 +859,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner); + lsp = __nfs4_find_lock_state(state, owner, 0); if (lsp != NULL) break; if (new != NULL) { @@ -939,22 +941,23 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) static int nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, - const struct nfs_lockowner *lockowner) + const struct nfs_lock_context *l_ctx) { struct nfs4_lock_state *lsp; - fl_owner_t fl_owner; + fl_owner_t fl_owner, fl_flock_owner; int ret = -ENOENT; - - if (lockowner == NULL) + if (l_ctx == NULL) goto out; if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; - fl_owner = lockowner->l_owner; + fl_owner = l_ctx->lockowner; + fl_flock_owner = l_ctx->open_context->flock_owner; + spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner); if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) ret = -EIO; else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { @@ -986,7 +989,7 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * requests. */ int nfs4_select_rw_stateid(struct nfs4_state *state, - fmode_t fmode, const struct nfs_lockowner *lockowner, + fmode_t fmode, const struct nfs_lock_context *l_ctx, nfs4_stateid *dst, struct rpc_cred **cred) { int ret; @@ -995,7 +998,7 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, return -EIO; if (cred != NULL) *cred = NULL; - ret = nfs4_copy_lock_stateid(dst, state, lockowner); + ret = nfs4_copy_lock_stateid(dst, state, l_ctx); if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; @@ -2190,7 +2193,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); } - nfs4_schedule_lease_recovery(clp); + nfs4_schedule_state_manager(clp); } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fc89e5ed07ee..1af6268a7d8c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,6 +52,7 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include <linux/fs_struct.h> #include "nfs4_fs.h" #include "internal.h" @@ -415,6 +416,8 @@ static int nfs4_stat_to_errno(int); #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 +#define encode_layoutreturn_maxsz 0 +#define decode_layoutreturn_maxsz 0 #endif /* CONFIG_NFS_V4_1 */ #define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ @@ -499,22 +502,22 @@ static int nfs4_stat_to_errno(int); (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_open_downgrade_maxsz + \ - encode_getattr_maxsz) + encode_open_downgrade_maxsz) #define NFS4_dec_open_downgrade_sz \ (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_open_downgrade_maxsz + \ - decode_getattr_maxsz) + decode_open_downgrade_maxsz) #define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ + encode_layoutreturn_maxsz + \ encode_close_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ + decode_layoutreturn_maxsz + \ decode_close_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ @@ -708,10 +711,13 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ + encode_layoutreturn_maxsz + \ encode_delegreturn_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz + \ decode_delegreturn_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ @@ -1003,7 +1009,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, const struct nfs_server *server, - bool excl_check) + bool excl_check, const umode_t *umask) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1017,18 +1023,21 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, /* * We reserve enough space to write the entire attribute buffer at once. - * In the worst-case, this would be - * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) - * = 40 bytes, plus any contribution from variable-length fields - * such as owner/group. */ if (iap->ia_valid & ATTR_SIZE) { bmval[0] |= FATTR4_WORD0_SIZE; len += 8; } + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + umask = NULL; if (iap->ia_valid & ATTR_MODE) { - bmval[1] |= FATTR4_WORD1_MODE; - len += 4; + if (umask) { + bmval[2] |= FATTR4_WORD2_MODE_UMASK; + len += 8; + } else { + bmval[1] |= FATTR4_WORD1_MODE; + len += 4; + } } if (iap->ia_valid & ATTR_UID) { owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); @@ -1129,6 +1138,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, *p++ = cpu_to_be32(label->len); p = xdr_encode_opaque_fixed(p, label->label, label->len); } + if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); + *p++ = cpu_to_be32(*umask); + } /* out: */ } @@ -1183,7 +1196,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server, false); + encode_attrs(xdr, create->attrs, create->label, create->server, false, + &create->umask); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1403,11 +1417,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, + &arg->umask); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, + &arg->umask); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1416,7 +1432,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true, + &arg->umask); } } @@ -1672,7 +1689,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server, false); + encode_attrs(xdr, arg->iap, arg->label, server, false, NULL); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2015,6 +2032,7 @@ encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args, struct compound_hdr *hdr) { + const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld; __be32 *p; encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); @@ -2029,10 +2047,11 @@ encode_layoutreturn(struct xdr_stream *xdr, spin_lock(&args->inode->i_lock); encode_nfs4_stateid(xdr, &args->stateid); spin_unlock(&args->inode->i_lock); - if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { - NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( - NFS_I(args->inode)->layout, xdr, args); - } else + if (args->ld_private->ops && args->ld_private->ops->encode) + args->ld_private->ops->encode(xdr, args, args->ld_private); + else if (lr_ops->encode_layoutreturn) + lr_ops->encode_layoutreturn(xdr, args); + else encode_uint32(xdr, 0); } @@ -2062,6 +2081,13 @@ static void encode_free_stateid(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); encode_nfs4_stateid(xdr, &args->stateid); } +#else +static inline void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) +{ +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2249,8 +2275,11 @@ static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + if (args->lr_args) + encode_layoutreturn(xdr, args->lr_args, &hdr); encode_close(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask != NULL) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2328,7 +2357,6 @@ static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_open_downgrade(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2671,6 +2699,8 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); + if (args->lr_args) + encode_layoutreturn(xdr, args->lr_args, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_delegreturn(xdr, args->stateid, &hdr); encode_nops(&hdr); @@ -6089,6 +6119,13 @@ static int decode_free_stateid(struct xdr_stream *xdr, res->status = decode_op_hdr(xdr, OP_FREE_STATEID); return res->status; } +#else +static inline +int decode_layoutreturn(struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -6115,9 +6152,6 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, if (status) goto out; status = decode_open_downgrade(xdr, res); - if (status != 0) - goto out; - decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6444,6 +6478,12 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; + if (res->lr_res) { + status = decode_layoutreturn(xdr, res->lr_res); + res->lr_ret = status; + if (status) + goto out; + } status = decode_close(xdr, res); if (status != 0) goto out; @@ -6920,6 +6960,12 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, status = decode_putfh(xdr); if (status != 0) goto out; + if (res->lr_res) { + status = decode_layoutreturn(xdr, res->lr_res); + res->lr_ret = status; + if (status) + goto out; + } status = decode_getfattr(xdr, res->fattr, res->server); if (status != 0) goto out; diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 919efd4a1a23..2a4cdce939a0 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -504,10 +504,10 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) } void -objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, - struct xdr_stream *xdr, +objlayout_encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args) { + struct pnfs_layout_hdr *pnfslay = args->layout; struct objlayout *objlay = OBJLAYOUT(pnfslay); struct objlayout_io_res *oir, *tmp; __be32 *start; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 2641dbad345c..fc94a5872ed4 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -175,7 +175,6 @@ extern void objlayout_encode_layoutcommit( const struct nfs4_layoutcommit_args *); extern void objlayout_encode_layoutreturn( - struct pnfs_layout_hdr *, struct xdr_stream *, const struct nfs4_layoutreturn_args *); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 965db474f4b0..6e629b856a00 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -867,8 +867,7 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { - return l1->lockowner.l_owner == l2->lockowner.l_owner - && l1->lockowner.l_pid == l2->lockowner.l_pid; + return l1->lockowner == l2->lockowner; } /** diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 259ef85f435a..896df7bdf85f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -54,6 +54,12 @@ static DEFINE_SPINLOCK(pnfs_spinlock); static LIST_HEAD(pnfs_modules_tbl); static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo); +static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, + struct list_head *free_me, + const struct pnfs_layout_range *range, + u32 seq); +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list); /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * @@ -299,6 +305,49 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +static void +pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, + u32 seq) +{ + if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) + iomode = IOMODE_ANY; + lo->plh_return_iomode = iomode; + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (seq != 0) { + WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); + lo->plh_return_seq = seq; + } +} + +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + +static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +{ + clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); + clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags); + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); +} + +static void +pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + clear_bit(NFS_LSEG_ROC, &lseg->pls_flags); + clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + pnfs_lseg_dec_and_remove_zero(lseg, free_me); + if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + pnfs_lseg_dec_and_remove_zero(lseg, free_me); +} + /* * Mark a pnfs_layout_hdr and all associated layout segments as invalid * @@ -315,9 +364,17 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, .offset = 0, .length = NFS4_MAX_UINT64, }; + struct pnfs_layout_segment *lseg, *next; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0); + pnfs_clear_layoutreturn_info(lo); + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + pnfs_clear_lseg_state(lseg, lseg_list); + pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && + !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) + pnfs_clear_layoutreturn_waitbit(lo); + return !list_empty(&lo->plh_segs); } static int @@ -396,27 +453,42 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) { - struct inode *ino = lseg->pls_layout->plh_inode; - - NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + if (lseg != NULL) { + struct inode *inode = lseg->pls_layout->plh_inode; + NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg); + } } static void pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { - struct inode *inode = lo->plh_inode; - WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); - if (list_empty(&lo->plh_segs)) { + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + return; + if (list_empty(&lo->plh_segs) && + !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && + !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { if (atomic_read(&lo->plh_outstanding) == 0) set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } - rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +static bool +pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && + pnfs_layout_is_valid(lo)) { + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + list_move_tail(&lseg->pls_list, &lo->plh_return_segs); + return true; + } + return false; } void @@ -442,6 +514,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); + if (pnfs_cache_lseg_for_layoutreturn(lo, lseg)) + lseg = NULL; spin_unlock(&inode->i_lock); pnfs_free_lseg(lseg); pnfs_put_layout_hdr(lo); @@ -482,22 +556,15 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) struct pnfs_layout_hdr *lo = lseg->pls_layout; if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) return; - pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); - pnfs_free_lseg_async(lseg); + if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) { + pnfs_get_layout_hdr(lo); + pnfs_free_lseg_async(lseg); + } } } EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); -static u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; -} - /* * is l2 fully contained in l1? * start1 end1 @@ -510,33 +577,13 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; - u64 end1 = end_offset(start1, l1->length); + u64 end1 = pnfs_end_offset(start1, l1->length); u64 start2 = l2->offset; - u64 end2 = end_offset(start2, l2->length); + u64 end2 = pnfs_end_offset(start2, l2->length); return (start1 <= start2) && (end1 >= end2); } -/* - * is l1 and l2 intersecting? - * start1 end1 - * [----------------------------------) - * start2 end2 - * [----------------) - */ -static bool -pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, - const struct pnfs_layout_range *l2) -{ - u64 start1 = l1->offset; - u64 end1 = end_offset(start1, l1->length); - u64 start2 = l2->offset; - u64 end2 = end_offset(start2, l2->length); - - return (end1 == NFS4_MAX_UINT64 || end1 > start2) && - (end2 == NFS4_MAX_UINT64 || end2 > start1); -} - static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { @@ -637,6 +684,20 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void +pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, + struct list_head *free_me, + const struct pnfs_layout_range *range, + u32 seq) +{ + struct pnfs_layout_segment *lseg, *next; + + list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) { + if (pnfs_match_lseg_recall(lseg, range, seq)) + list_move_tail(&lseg->pls_list, free_me); + } +} + /* note free_me must contain lsegs from a single layout_hdr */ void pnfs_free_lseg_list(struct list_head *free_me) @@ -701,6 +762,8 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, struct inode *inode; list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) + continue; inode = igrab(lo->plh_inode); if (inode == NULL) continue; @@ -816,14 +879,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } -static void -pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) -{ - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -941,12 +996,31 @@ static void pnfs_clear_layoutcommit(struct inode *inode, } } -void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid) { - clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); - smp_mb__after_atomic(); - wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); - rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); + struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo) || !arg_stateid || + !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + goto out_unlock; + if (stateid) { + u32 seq = be32_to_cpu(arg_stateid->seqid); + + pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); + pnfs_free_returned_lsegs(lo, &freeme, range, seq); + pnfs_set_layout_stateid(lo, stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); +out_unlock: + pnfs_clear_layoutreturn_waitbit(lo); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); + } static bool @@ -957,8 +1031,9 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, /* Serialise LAYOUTGET/LAYOUTRETURN */ if (atomic_read(&lo->plh_outstanding) != 0) return false; - if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { if (stateid != NULL) { @@ -978,11 +1053,29 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, return true; } +static void +pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args, + struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid, + enum pnfs_iomode iomode) +{ + struct inode *inode = lo->plh_inode; + + args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id; + args->inode = inode; + args->range.iomode = iomode; + args->range.offset = 0; + args->range.length = NFS4_MAX_UINT64; + args->layout = lo; + nfs4_stateid_copy(&args->stateid, stateid); +} + static int pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) { struct inode *ino = lo->plh_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; struct nfs4_layoutreturn *lrp; int status = 0; @@ -996,15 +1089,12 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, goto out; } - nfs4_stateid_copy(&lrp->args.stateid, stateid); - lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; - lrp->args.inode = ino; - lrp->args.range.iomode = iomode; - lrp->args.range.offset = 0; - lrp->args.range.length = NFS4_MAX_UINT64; - lrp->args.layout = lo; + pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode); + lrp->args.ld_private = &lrp->ld_private; lrp->clp = NFS_SERVER(ino)->nfs_client; lrp->cred = lo->plh_lc_cred; + if (ld->prepare_layoutreturn) + ld->prepare_layoutreturn(&lrp->args); status = nfs4_proc_layoutreturn(lrp, sync); out: @@ -1067,7 +1157,7 @@ _pnfs_return_layout(struct inode *ino) struct nfs_inode *nfsi = NFS_I(ino); LIST_HEAD(tmp_list); nfs4_stateid stateid; - int status = 0, empty; + int status = 0; bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1081,7 +1171,14 @@ _pnfs_return_layout(struct inode *ino) } /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); - empty = list_empty(&lo->plh_segs); + /* Is there an outstanding layoutreturn ? */ + if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, + TASK_UNINTERRUPTIBLE)) + goto out_put_layout_hdr; + spin_lock(&ino->i_lock); + } pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); @@ -1095,7 +1192,7 @@ _pnfs_return_layout(struct inode *ino) } /* Don't send a LAYOUTRETURN if list was initially empty */ - if (empty) { + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout segments to return\n", __func__); goto out_put_layout_hdr; @@ -1141,21 +1238,36 @@ pnfs_commit_and_return_layout(struct inode *inode) return ret; } -bool pnfs_roc(struct inode *ino) +bool pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred) { struct nfs_inode *nfsi = NFS_I(ino); struct nfs_open_context *ctx; struct nfs4_state *state; struct pnfs_layout_hdr *lo; - struct pnfs_layout_segment *lseg, *tmp; + struct pnfs_layout_segment *lseg, *next; nfs4_stateid stateid; - LIST_HEAD(tmp_list); - bool found = false, layoutreturn = false, roc = false; + enum pnfs_iomode iomode = 0; + bool layoutreturn = false, roc = false; + if (!nfs_have_layout(ino)) + return false; +retry: spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + if (!lo || !pnfs_layout_is_valid(lo) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) goto out_noroc; + if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { + pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, + TASK_UNINTERRUPTIBLE); + pnfs_put_layout_hdr(lo); + goto retry; + } /* no roc if we hold a delegation */ if (nfs4_check_delegation(ino, FMODE_READ)) @@ -1168,78 +1280,73 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - /* always send layoutreturn if being marked so */ - if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo, - &stateid, NULL); - list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) { /* If we are sending layoutreturn, invalidate all valid lsegs */ - if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - mark_lseg_invalid(lseg, &tmp_list); - found = true; - } + if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags)) + continue; + /* + * Note: mark lseg for return so pnfs_layout_remove_lseg + * doesn't invalidate the layout for us. + */ + set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + if (!mark_lseg_invalid(lseg, &lo->plh_return_segs)) + continue; + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + } + + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + goto out_noroc; + /* ROC in two conditions: * 1. there are ROC lsegs * 2. we don't send layoutreturn */ - if (found && !layoutreturn) { - /* lo ref dropped in pnfs_roc_release() */ - pnfs_get_layout_hdr(lo); - roc = true; - } + /* lo ref dropped in pnfs_roc_release() */ + layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + /* If the creds don't match, we can't compound the layoutreturn */ + if (!layoutreturn || cred != lo->plh_lc_cred) + goto out_noroc; + + roc = layoutreturn; + pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); + res->lrs_present = 0; + layoutreturn = false; out_noroc: spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); pnfs_layoutcommit_inode(ino, true); + if (roc) { + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + if (ld->prepare_layoutreturn) + ld->prepare_layoutreturn(args); + return true; + } if (layoutreturn) - pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); - return roc; -} - -void pnfs_roc_release(struct inode *ino) -{ - struct pnfs_layout_hdr *lo; - - spin_lock(&ino->i_lock); - lo = NFS_I(ino)->layout; - pnfs_clear_layoutreturn_waitbit(lo); - if (atomic_dec_and_test(&lo->plh_refcount)) { - pnfs_detach_layout_hdr(lo); - spin_unlock(&ino->i_lock); - pnfs_free_layout_hdr(lo); - } else - spin_unlock(&ino->i_lock); + pnfs_send_layoutreturn(lo, &stateid, iomode, true); + return false; } -void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +void pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret) { - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = args->layout; + const nfs4_stateid *arg_stateid = NULL; + const nfs4_stateid *res_stateid = NULL; + struct nfs4_xdr_opaque_data *ld_private = args->ld_private; - spin_lock(&ino->i_lock); - lo = NFS_I(ino)->layout; - if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) - lo->plh_barrier = barrier; - spin_unlock(&ino->i_lock); - trace_nfs4_layoutreturn_on_close(ino, 0); -} - -void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) -{ - struct nfs_inode *nfsi = NFS_I(ino); - struct pnfs_layout_hdr *lo; - u32 current_seqid; - - spin_lock(&ino->i_lock); - lo = nfsi->layout; - current_seqid = be32_to_cpu(lo->plh_stateid.seqid); - - /* Since close does not return a layout stateid for use as - * a barrier, we choose the worst-case barrier. - */ - *barrier = current_seqid + atomic_read(&lo->plh_outstanding); - spin_unlock(&ino->i_lock); + if (ret == 0) { + arg_stateid = &args->stateid; + if (res->lrs_present) + res_stateid = &res->stateid; + } + pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, + res_stateid); + if (ld_private && ld_private->ops && ld_private->ops->free) + ld_private->ops->free(ld_private); + pnfs_put_layout_hdr(lo); + trace_nfs4_layoutreturn_on_close(args->inode, 0); } bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) @@ -1252,13 +1359,11 @@ bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) * i_lock */ spin_lock(&ino->i_lock); lo = nfsi->layout; - if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); sleep = true; + } spin_unlock(&ino->i_lock); - - if (sleep) - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - return sleep; } @@ -1375,6 +1480,7 @@ alloc_init_layout_hdr(struct inode *ino, atomic_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_return_segs); INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->cred); @@ -1841,7 +1947,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + if (!pnfs_layout_is_valid(lo)) { + /* We have a completely new layout */ + pnfs_set_layout_stateid(lo, &res->stateid, true); + } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); @@ -1851,12 +1960,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } else { /* * We got an entirely new state ID. Mark all segments for the - * inode invalid, and don't bother validating the stateid - * sequence number. + * inode invalid, and retry the layoutget */ pnfs_mark_layout_stateid_invalid(lo, &free_me); - - pnfs_set_layout_stateid(lo, &res->stateid, true); + goto out_forget; } pnfs_get_lseg(lseg); @@ -1877,20 +1984,6 @@ out_forget: return ERR_PTR(-EAGAIN); } -static void -pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, - u32 seq) -{ - if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) - iomode = IOMODE_ANY; - lo->plh_return_iomode = iomode; - set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (seq != 0) { - WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); - lo->plh_return_seq = seq; - } -} - /** * pnfs_mark_matching_lsegs_return - Free or return matching layout segments * @lo: pointer to layout header @@ -1945,17 +2038,18 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, .offset = 0, .length = NFS4_MAX_UINT64, }; - LIST_HEAD(free_me); bool return_now = false; spin_lock(&inode->i_lock); pnfs_set_plh_return_info(lo, range.iomode, 0); + /* Block LAYOUTGET */ + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { + if (!pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0)) { nfs4_stateid stateid; enum pnfs_iomode iomode; @@ -1967,7 +2061,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_unlock(&inode->i_lock); nfs_commit_inode(inode, 0); } - pnfs_free_lseg_list(&free_me); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); @@ -2063,7 +2156,7 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, * */ if (pgio->pg_lseg) { - seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset, pgio->pg_lseg->pls_range.length); req_start = req_offset(req); WARN_ON_ONCE(req_start >= seg_end); @@ -2286,6 +2379,10 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) struct nfs_pageio_descriptor pgio; if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + /* Prevent deadlocks with layoutreturn! */ + pnfs_put_lseg(hdr->lseg); + hdr->lseg = NULL; + nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 5c295512c967..63f77b49a586 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -96,6 +96,7 @@ enum { NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_RETURN, /* layoutreturn in progress */ + NFS_LAYOUT_RETURN_LOCK, /* Serialise layoutreturn */ NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ @@ -171,8 +172,8 @@ struct pnfs_layoutdriver_type { (struct nfs_server *server, struct pnfs_device *pdev, gfp_t gfp_flags); - void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, - struct xdr_stream *xdr, + int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *); + void (*encode_layoutreturn) (struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); @@ -181,7 +182,6 @@ struct pnfs_layoutdriver_type { struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); - void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data); }; struct pnfs_layout_hdr { @@ -190,6 +190,7 @@ struct pnfs_layout_hdr { struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ + struct list_head plh_return_segs; /* invalid layout segments */ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ unsigned long plh_retry_timestamp; unsigned long plh_flags; @@ -270,10 +271,13 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq); int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct list_head *lseg_list); -bool pnfs_roc(struct inode *ino); -void pnfs_roc_release(struct inode *ino); -void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier); +bool pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred); +void pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret); bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task); void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); @@ -292,7 +296,10 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, enum pnfs_iomode iomode, bool strict_iomode, gfp_t gfp_flags); -void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); +void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid); void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, @@ -362,8 +369,7 @@ struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, void nfs4_pnfs_v3_ds_connect_unload(void); void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, - unsigned int retrans, u32 version, u32 minor_version, - rpc_authflavor_t au_flavor); + unsigned int retrans, u32 version, u32 minor_version); struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags); @@ -559,6 +565,38 @@ pnfs_copy_range(struct pnfs_layout_range *dst, memcpy(dst, src, sizeof(*dst)); } +static inline u64 +pnfs_end_offset(u64 start, u64 len) +{ + if (NFS4_MAX_UINT64 - start <= len) + return NFS4_MAX_UINT64; + return start + len; +} + +/* + * Are 2 ranges intersecting? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline bool +pnfs_is_range_intersecting(u64 start1, u64 end1, u64 start2, u64 end2) +{ + return (end1 == NFS4_MAX_UINT64 || start2 < end1) && + (end2 == NFS4_MAX_UINT64 || start1 < end2); +} + +static inline bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 end1 = pnfs_end_offset(l1->offset, l1->length); + u64 end2 = pnfs_end_offset(l2->offset, l2->length); + + return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG @@ -630,23 +668,18 @@ pnfs_layoutcommit_outstanding(struct inode *inode) static inline bool -pnfs_roc(struct inode *ino) +pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred) { return false; } static inline void -pnfs_roc_release(struct inode *ino) -{ -} - -static inline void -pnfs_roc_set_barrier(struct inode *ino, u32 barrier) -{ -} - -static inline void -pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) +pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret) { } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 53b4705abcc7..9414b492439f 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -600,8 +600,7 @@ static struct nfs_client *(*get_v3_ds_connect)( int ds_addrlen, int ds_proto, unsigned int ds_timeo, - unsigned int ds_retrans, - rpc_authflavor_t au_flavor); + unsigned int ds_retrans); static bool load_v3_ds_connect(void) { @@ -625,15 +624,13 @@ EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, unsigned int timeo, - unsigned int retrans, - rpc_authflavor_t au_flavor) + unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s DS %s au_flavor %d\n", __func__, - ds->ds_remotestr, au_flavor); + dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); if (!load_v3_ds_connect()) goto out; @@ -657,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, - timeo, retrans, au_flavor); + timeo, retrans); } if (IS_ERR(clp)) { @@ -676,15 +673,13 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, unsigned int timeo, unsigned int retrans, - u32 minor_version, - rpc_authflavor_t au_flavor) + u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, - au_flavor); + dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", @@ -720,8 +715,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version, - au_flavor); + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -755,19 +749,17 @@ out: */ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, - unsigned int retrans, u32 version, - u32 minor_version, rpc_authflavor_t au_flavor) + unsigned int retrans, u32 version, u32 minor_version) { if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { int err = 0; if (version == 3) { err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, - retrans, au_flavor); + retrans); } else if (version == 4) { err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, - retrans, minor_version, - au_flavor); + retrans, minor_version); } else { dprintk("%s: unsupported DS version %d\n", __func__, version); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 001796bcd6c8..ddce94ce8142 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2904,7 +2904,7 @@ module_param(max_session_slots, ushort, 0644); MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " "requests the client will negotiate"); module_param(max_session_cb_slots, ushort, 0644); -MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 " +MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 " "callbacks the client will process for a given server"); module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53211838f72a..6e761f3f4cbf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1151,8 +1151,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { - do_flush |= l_ctx->lockowner.l_owner != current->files - || l_ctx->lockowner.l_pid != current->tgid; + do_flush |= l_ctx->lockowner != current->files; } nfs_release_request(req); if (!do_flush) diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index fd8c9a5bcac4..420d3a0ab258 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,7 +9,7 @@ #include <net/netns/generic.h> #include <linux/fs.h> -static int grace_net_id; +static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ee36efd5aece..3714231a9d0f 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -124,5 +124,5 @@ struct nfsd_net { /* Simple check to find out if a given net was properly initialized */ #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) -extern int nfsd_net_id; +extern unsigned int nfsd_net_id; #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 36b2af931e06..2857e46d5cc5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1201,7 +1201,7 @@ static int create_proc_exports_entry(void) } #endif -int nfsd_net_id; +unsigned int nfsd_net_id; static __net_init int nfsd_init_net(struct net *net) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c95d369e90aa..12eeae62a2b1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -189,7 +189,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag) set_buffer_dirty(nilfs->ns_sbh[0]); if (nilfs_test_opt(nilfs, BARRIER)) { err = __sync_dirty_buffer(nilfs->ns_sbh[0], - WRITE_SYNC | WRITE_FLUSH_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } else { err = sync_dirty_buffer(nilfs->ns_sbh[0]); } diff --git a/fs/nsfs.c b/fs/nsfs.c index 8718af895eab..8c9fb29c6673 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -118,7 +118,7 @@ again: return ret; } -static int open_related_ns(struct ns_common *ns, +int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct path path = {}; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fe251f187ff8..cc91856b5e2d 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -29,6 +29,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/bit_spinlock.h> +#include <linux/bio.h> #include "aops.h" #include "attrib.h" @@ -764,7 +765,7 @@ lock_retry_remap: } // TODO: Instantiate the hole. // clear_buffer_new(bh); - // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + // clean_bdev_bh_alias(bh); ntfs_error(vol->sb, "Writing into sparse regions is " "not supported yet. Sorry."); err = -EOPNOTSUPP; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index bf72a2c58b75..99510d811a8c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -740,8 +740,7 @@ map_buffer_cached: set_buffer_uptodate(bh); if (unlikely(was_hole)) { /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (bh_end <= pos || bh_pos >= end) mark_buffer_dirty(bh); else @@ -784,7 +783,7 @@ map_buffer_cached: continue; } /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * If the buffer is fully outside the write, zero it, * set it uptodate, and mark it dirty so it gets diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index 761f12f7f3ef..353379ff6057 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -27,6 +27,7 @@ #include <linux/buffer_head.h> #include <linux/bitops.h> #include <linux/log2.h> +#include <linux/bio.h> #include "attrib.h" #include "aops.h" diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index d3c009626032..b6f402194f02 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -23,6 +23,7 @@ #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/swap.h> +#include <linux/bio.h> #include "attrib.h" #include "aops.h" diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..4d9c6f5ec28a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -630,7 +630,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } if (PageUptodate(page)) { @@ -1950,8 +1950,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, } int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, void *fsdata) { int i, ret; unsigned from, to, start = pos & (PAGE_SIZE - 1); @@ -2064,7 +2063,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, int ret; struct inode *inode = mapping->host; - ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata); up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_inode_unlock(inode, 1); @@ -2241,7 +2240,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, dwc->dw_zero_count++; } - ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); BUG_ON(ret != len); ret = 0; unlock: diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index b1c9f28a57b1..8614ff069d99 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle, struct buffer_head *bh)); int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + loff_t pos, unsigned len, unsigned copied, void *fsdata); typedef enum { OCFS2_WRITE_BUFFER = 0, diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 8f040f88ade4..d9ebe11c8990 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> +#include <linux/bio.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 636abcbd4650..96a155ab5059 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -627,7 +627,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg, hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; memset(hb_block, 0, reg->hr_block_bytes); /* TODO: time stuff */ - cputime = CURRENT_TIME.tv_sec; + cputime = ktime_get_real_seconds(); if (!cputime) cputime = 1; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3f828a187049..a464c8088170 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1609,8 +1609,6 @@ way_up_top: __dlm_insert_mle(dlm, mle); response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "mle was found\n"); - set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->master == dlm->node_num) { mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); @@ -1625,8 +1623,7 @@ way_up_top: response = DLM_MASTER_RESP_NO; } else response = DLM_MASTER_RESP_MAYBE; - if (set_maybe) - set_bit(request->node_idx, tmpmle->maybe_map); + set_bit(request->node_idx, tmpmle->maybe_map); spin_unlock(&tmpmle->spinlock); } spin_unlock(&dlm->master_lock); @@ -1644,12 +1641,6 @@ send_response: * dlm_assert_master_worker() isn't called, we drop it here. */ if (dispatch_assert) { - if (response != DLM_MASTER_RESP_YES) - mlog(ML_ERROR, "invalid response %d\n", response); - if (!res) { - mlog(ML_ERROR, "bad lockres while trying to assert!\n"); - BUG(); - } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); spin_lock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index dd5cb8bcefd1..74407c6dd592 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); break; - default: - BUG(); } mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c56a7679df93..382401d3e88f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_commit; } - di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a244f14c6b87..d5e5fa7f0743 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) */ seqno++; os->os_count++; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: @@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) struct ocfs2_orphan_scan *os; os = &osb->osb_orphan_scan; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 71545ad4628c..429088786e93 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); BUG_ON(ret != len); ret = VM_FAULT_LOCKED; out: diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8d887c75765c..3b0a10d9b36f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_extent_list *fel; u16 feat; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct timespec64 ts; *new_fe_bh = NULL; @@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); + ktime_get_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = - cpu_to_le64(CURRENT_TIME.tv_sec); + cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = - cpu_to_le32(CURRENT_TIME.tv_nsec); + cpu_to_le32(ts.tv_nsec); fe->i_dtime = 0; /* diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index e63af7ddfe68..7e5958b0be6b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,7 +224,7 @@ struct ocfs2_orphan_scan { struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; - struct timespec os_scantime; /* time this node ran the scan */ + time64_t os_scantime; /* time this node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19238512a324..738b4ea8e990 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -478,7 +478,6 @@ again: if (ret) { mlog_errno(ret); ocfs2_unlock_refcount_tree(osb, tree, rw); - ocfs2_refcount_tree_put(tree); goto out; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f56fe39fab04..c894d945b084 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) out += snprintf(buf + out, len - out, "Disabled\n"); else out += snprintf(buf + out, len - out, "%lu seconds ago\n", - (get_seconds() - os->os_scantime.tv_sec)); + (unsigned long)(ktime_get_seconds() - os->os_scantime)); out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", "Slots", "Num", "RecoGen"); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index ef3b4eb54cf2..551bc74ed2b8 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -8,6 +8,7 @@ * Linux VFS inode operations. */ +#include <linux/bvec.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index f1e824979aad..27e75cf28b3a 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -673,8 +673,10 @@ int orangefs_prepare_debugfs_help_string(int at_boot) */ cdm_element_count = orangefs_prepare_cdm_array(client_debug_array_string); - if (cdm_element_count <= 0) + if (cdm_element_count <= 0) { + kfree(new); goto out; + } for (i = 0; i < cdm_element_count; i++) { strlcat(new, "\t", string_size); diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index a799546a67f7..084954448f18 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -609,15 +609,6 @@ static ssize_t sysfs_service_op_store(struct kobject *kobj, new_op->upcall.req.param.u.value32[0] = val1; new_op->upcall.req.param.u.value32[1] = val2; goto value_set; - } else if (!strcmp(attr->attr.name, - "perf_counter_reset")) { - if ((val > 0)) { - new_op->upcall.req.param.op = - ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; - } else { - rc = 0; - goto out; - } } } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index edd46a0e951d..0e100856c7b8 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -328,11 +328,11 @@ static struct dentry *ovl_d_real(struct dentry *dentry, if (!real) goto bug; + /* Handle recursion */ + real = d_real(real, inode, open_flags); + if (!inode || inode == d_inode(real)) return real; - - /* Handle recursion */ - return d_real(real, inode, open_flags); bug: WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry, inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0); diff --git a/fs/proc/array.c b/fs/proc/array.c index 81818adb8e9e..51a4213afa2e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header, if (sigismember(set, i+2)) x |= 2; if (sigismember(set, i+3)) x |= 4; if (sigismember(set, i+4)) x |= 8; - seq_printf(m, "%x", x); + seq_putc(m, hex_asc[x]); } while (i >= 4); seq_putc(m, '\n'); @@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { + seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP - seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode); - seq_putc(m, '\n'); + seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif + seq_putc(m, '\n'); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/fs/proc/base.c b/fs/proc/base.c index ca651ac00660..5ea836362870 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -104,9 +104,12 @@ * in /proc for a task before it execs a suid executable. */ +static u8 nlink_tid; +static u8 nlink_tgid; + struct pid_entry { const char *name; - int len; + unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; @@ -139,13 +142,13 @@ struct pid_entry { * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. */ -static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, +static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; - count = 0; + count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; @@ -1243,7 +1246,7 @@ static const struct file_operations proc_oom_score_adj_operations = { }; #ifdef CONFIG_AUDITSYSCALL -#define TMPBUFLEN 21 +#define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -1664,7 +1667,8 @@ const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +struct inode *proc_pid_make_inode(struct super_block * sb, + struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; @@ -1678,6 +1682,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t /* Common stuff */ ei = PROC_I(inode); + inode->i_mode = mode; inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_op = &proc_def_inode_operations; @@ -1967,7 +1972,7 @@ out: struct map_files_info { fmode_t mode; - unsigned long len; + unsigned int len; unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; @@ -2004,7 +2009,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | + ((mode & FMODE_READ ) ? S_IRUSR : 0) | + ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) return -ENOENT; @@ -2013,12 +2020,6 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; - inode->i_mode = S_IFLNK; - - if (mode & FMODE_READ) - inode->i_mode |= S_IRUSR; - if (mode & FMODE_WRITE) - inode->i_mode |= S_IWUSR; d_set_d_op(dentry, &tid_map_files_dentry_operations); d_add(dentry, inode); @@ -2372,12 +2373,11 @@ static int proc_pident_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, p->mode); if (!inode) goto out; ei = PROC_I(inode); - inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) @@ -2412,14 +2412,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. */ - last = &ents[nents - 1]; - for (p = ents; p <= last; p++) { + last = &ents[nents]; + for (p = ents; p < last; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) break; } - if (p > last) + if (p >= last) goto out; error = proc_pident_instantiate(dir, dentry, task, p); @@ -2444,7 +2444,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx, if (ctx->pos >= nents + 2) goto out; - for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; @@ -3059,17 +3059,15 @@ static int proc_pid_instantiate(struct inode *dir, { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto out; - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff))); + set_nlink(inode, nlink_tgid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3352,17 +3350,15 @@ static int proc_task_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto out; - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff))); + set_nlink(inode, nlink_tid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3552,3 +3548,9 @@ static const struct file_operations proc_task_operations = { .iterate_shared = proc_task_readdir, .llseek = generic_file_llseek, }; + +void __init set_proc_pid_nlink(void) +{ + nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} diff --git a/fs/proc/fd.c b/fs/proc/fd.c index d21dafef3102..4274f83bf100 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -183,14 +183,13 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK); if (!inode) goto out; ei = PROC_I(inode); ei->fd = fd; - inode->i_mode = S_IFLNK; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -322,14 +321,13 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR); if (!inode) goto out; ei = PROC_I(inode); ei->fd = fd; - inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5f2dc2032c79..7eb3cefcf2a3 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -479,6 +479,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) } return ent; } +EXPORT_SYMBOL(proc_create_mount_point); struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e69ebe648a34..783bc19644d1 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { + /* + * close() (proc_reg_release()) can't delete an entry and proceed: + * ->release hook needs to be available at the right moment. + * + * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: + * "struct file" needs to be available at the right moment. + * + * Therefore, first process to enter this function does ->release() and + * signals its completion to the other process which does nothing. + */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); @@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); } else { struct file *file; - pdeo->closing = 1; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_fops->release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); - list_del_init(&pdeo->lh); + /* After ->release. */ + list_del(&pdeo->lh); if (pdeo->c) complete(pdeo->c); kfree(pdeo); @@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de) if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); + /* ->pde_openers list can't grow from now on. */ + spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; @@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct pde_opener *pdeo; /* - * What for, you ask? Well, we can have open, rmmod, remove_proc_entry - * sequence. ->release won't be called because ->proc_fops will be - * cleared. Depending on complexity of ->release, consequences vary. + * Ensure that + * 1) PDE's ->release hook will be called no matter what + * either normally by close()/->release, or forcefully by + * rmmod/remove_proc_entry. + * + * 2) rmmod isn't blocked by opening file in /proc and sitting on + * the descriptor (including "rmmod foo </proc/foo" scenario). * - * We can't wait for mercy when close will be done for real, it's - * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release - * by hand in remove_proc_entry(). For this, save opener's credentials - * for later. + * Save every "struct file" with custom ->release hook. */ - pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; @@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (rv == 0 && release) { /* To know what to release. */ pdeo->file = file; - /* Strictly for "too late" ->release in proc_reg_release(). */ + pdeo->closing = false; + pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5378441ec1b7..2de5194ba378 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,7 +162,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int proc_setattr(struct dentry *, struct iattr *); -extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern int pid_revalidate(struct dentry *, unsigned int); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); @@ -195,7 +195,6 @@ static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } -struct proc_dir_entry *proc_create_mount_point(const char *name); /* * inode.c @@ -203,7 +202,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name); struct pde_opener { struct file *file; struct list_head lh; - int closing; + bool closing; struct completion *c; }; extern const struct inode_operations proc_link_inode_operations; @@ -211,6 +210,7 @@ extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern void proc_init_inodecache(void); +void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 51b8b0a8ad91..766f0c637ad1 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -92,12 +92,11 @@ static int proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) goto out; ei = PROC_I(inode); - inode->i_mode = S_IFLNK|S_IRWXUGO; inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; diff --git a/fs/proc/root.c b/fs/proc/root.c index 8d3e484055a6..4bd0373576b5 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -122,6 +122,7 @@ void __init proc_root_init(void) int err; proc_init_inodecache(); + set_proc_pid_nlink(); err = register_filesystem(&proc_fs_type); if (err) return; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 35b92d81692f..958f32545064 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); + cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index be40813eff52..b42e5bd6d8ff 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -86,4 +86,4 @@ config PSTORE_RAM Note that for historical reasons, the module will be named "ramoops.ko". - For more information, see Documentation/ramoops.txt. + For more information, see Documentation/admin-guide/ramoops.rst. diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index d4887705bb61..899d0ba0bd6c 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -27,6 +27,9 @@ #include <asm/barrier.h> #include "internal.h" +/* This doesn't need to be atomic: speed is chosen over correctness here. */ +static u64 pstore_ftrace_stamp; + static void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, @@ -42,6 +45,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.ip = ip; rec.parent_ip = parent_ip; + pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++); pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, 0, sizeof(rec), psinfo); @@ -71,10 +75,13 @@ static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, if (!on ^ pstore_ftrace_enabled) goto out; - if (on) + if (on) { + ftrace_ops_set_global_filter(&pstore_ftrace_ops); ret = register_ftrace_function(&pstore_ftrace_ops); - else + } else { ret = unregister_ftrace_function(&pstore_ftrace_ops); + } + if (ret) { pr_err("%s: unable to %sregister ftrace ops: %zd\n", __func__, on ? "" : "un", ret); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1781dc50762e..57c0646479f5 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -107,9 +107,11 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) struct pstore_ftrace_seq_data *data = v; struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); - seq_printf(s, "%d %08lx %08lx %pf <- %pF\n", - pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip, - (void *)rec->ip, (void *)rec->parent_ip); + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", + pstore_ftrace_decode_cpu(rec), + pstore_ftrace_read_timestamp(rec), + rec->ip, rec->parent_ip, (void *)rec->ip, + (void *)rec->parent_ip); return 0; } @@ -197,11 +199,14 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (err) return err; - if (p->psi->erase) + if (p->psi->erase) { + mutex_lock(&p->psi->read_mutex); p->psi->erase(p->type, p->id, p->count, d_inode(dentry)->i_ctime, p->psi); - else + mutex_unlock(&p->psi->read_mutex); + } else { return -EPERM; + } return simple_unlink(dir, dentry); } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index e38a22b31282..da416e6591c9 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -5,40 +5,6 @@ #include <linux/time.h> #include <linux/pstore.h> -#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB) -#define PSTORE_CPU_IN_IP 0x1 -#elif NR_CPUS <= 4 && defined(CONFIG_ARM) -#define PSTORE_CPU_IN_IP 0x3 -#endif - -struct pstore_ftrace_record { - unsigned long ip; - unsigned long parent_ip; -#ifndef PSTORE_CPU_IN_IP - unsigned int cpu; -#endif -}; - -static inline void -pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu) -{ -#ifndef PSTORE_CPU_IN_IP - rec->cpu = cpu; -#else - rec->ip |= cpu; -#endif -} - -static inline unsigned int -pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec) -{ -#ifndef PSTORE_CPU_IN_IP - return rec->cpu; -#else - return rec->ip & PSTORE_CPU_IN_IP; -#endif -} - #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); extern void pstore_unregister_ftrace(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 14984d902a99..729677e18e36 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -493,6 +493,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, if (!is_locked) { pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" , in_nmi() ? "NMI" : why); + return; } } else { spin_lock_irqsave(&psinfo->buf_lock, flags); @@ -584,8 +585,8 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } else { spin_lock_irqsave(&psinfo->buf_lock, flags); } - memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo); + psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0, + s, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 6ad831b9d1b8..27c059e1760a 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -85,10 +85,10 @@ MODULE_PARM_DESC(ramoops_ecc, "bytes ECC)"); struct ramoops_context { - struct persistent_ram_zone **przs; - struct persistent_ram_zone *cprz; - struct persistent_ram_zone *fprz; - struct persistent_ram_zone *mprz; + struct persistent_ram_zone **dprzs; /* Oops dump zones */ + struct persistent_ram_zone *cprz; /* Console zone */ + struct persistent_ram_zone **fprzs; /* Ftrace zones */ + struct persistent_ram_zone *mprz; /* PMSG zone */ phys_addr_t phys_addr; unsigned long size; unsigned int memtype; @@ -97,12 +97,14 @@ struct ramoops_context { size_t ftrace_size; size_t pmsg_size; int dump_oops; + u32 flags; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; /* _read_cnt need clear on ramoops_pstore_open */ unsigned int dump_read_cnt; unsigned int console_read_cnt; + unsigned int max_ftrace_cnt; unsigned int ftrace_read_cnt; unsigned int pmsg_read_cnt; struct pstore_info pstore; @@ -180,16 +182,69 @@ static bool prz_ok(struct persistent_ram_zone *prz) persistent_ram_ecc_string(prz, NULL, 0)); } +static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, + struct persistent_ram_zone *src) +{ + size_t dest_size, src_size, total, dest_off, src_off; + size_t dest_idx = 0, src_idx = 0, merged_idx = 0; + void *merged_buf; + struct pstore_ftrace_record *drec, *srec, *mrec; + size_t record_size = sizeof(struct pstore_ftrace_record); + + dest_off = dest->old_log_size % record_size; + dest_size = dest->old_log_size - dest_off; + + src_off = src->old_log_size % record_size; + src_size = src->old_log_size - src_off; + + total = dest_size + src_size; + merged_buf = kmalloc(total, GFP_KERNEL); + if (!merged_buf) + return -ENOMEM; + + drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off); + srec = (struct pstore_ftrace_record *)(src->old_log + src_off); + mrec = (struct pstore_ftrace_record *)(merged_buf); + + while (dest_size > 0 && src_size > 0) { + if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < + pstore_ftrace_read_timestamp(&srec[src_idx])) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } else { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + } + + while (dest_size > 0) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } + + while (src_size > 0) { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + + kfree(dest->old_log); + dest->old_log = merged_buf; + dest->old_log_size = total; + + return 0; +} + static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, int *count, struct timespec *time, char **buf, bool *compressed, ssize_t *ecc_notice_size, struct pstore_info *psi) { - ssize_t size; + ssize_t size = 0; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; + bool free_prz = false; /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have @@ -201,7 +256,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { - prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, + prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, cxt->max_dump_cnt, id, type, PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) @@ -219,14 +274,56 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 1, id, type, PSTORE_TYPE_CONSOLE, 0); - if (!prz_ok(prz)) - prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, - 1, id, type, PSTORE_TYPE_FTRACE, 0); + if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, 1, id, type, PSTORE_TYPE_PMSG, 0); - if (!prz_ok(prz)) - return 0; + + /* ftrace is last since it may want to dynamically allocate memory. */ + if (!prz_ok(prz)) { + if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { + prz = ramoops_get_next_prz(cxt->fprzs, + &cxt->ftrace_read_cnt, 1, id, type, + PSTORE_TYPE_FTRACE, 0); + } else { + /* + * Build a new dummy record which combines all the + * per-cpu records including metadata and ecc info. + */ + struct persistent_ram_zone *tmp_prz, *prz_next; + + tmp_prz = kzalloc(sizeof(struct persistent_ram_zone), + GFP_KERNEL); + if (!tmp_prz) + return -ENOMEM; + free_prz = true; + + while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { + prz_next = ramoops_get_next_prz(cxt->fprzs, + &cxt->ftrace_read_cnt, + cxt->max_ftrace_cnt, id, + type, PSTORE_TYPE_FTRACE, 0); + + if (!prz_ok(prz_next)) + continue; + + tmp_prz->ecc_info = prz_next->ecc_info; + tmp_prz->corrected_bytes += + prz_next->corrected_bytes; + tmp_prz->bad_blocks += prz_next->bad_blocks; + size = ftrace_log_combine(tmp_prz, prz_next); + if (size) + goto out; + } + *id = 0; + prz = tmp_prz; + } + } + + if (!prz_ok(prz)) { + size = 0; + goto out; + } size = persistent_ram_old_size(prz) - header_length; @@ -234,12 +331,21 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); - if (*buf == NULL) - return -ENOMEM; + if (*buf == NULL) { + size = -ENOMEM; + goto out; + } memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); + persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); +out: + if (free_prz) { + kfree(prz->old_log); + kfree(prz); + } + return size; } @@ -283,15 +389,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, persistent_ram_write(cxt->cprz, buf, size); return 0; } else if (type == PSTORE_TYPE_FTRACE) { - if (!cxt->fprz) + int zonenum; + + if (!cxt->fprzs) return -ENOMEM; - persistent_ram_write(cxt->fprz, buf, size); + /* + * Choose zone by if we're using per-cpu buffers. + */ + if (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + zonenum = smp_processor_id(); + else + zonenum = 0; + + persistent_ram_write(cxt->fprzs[zonenum], buf, size); return 0; } else if (type == PSTORE_TYPE_PMSG) { - if (!cxt->mprz) - return -ENOMEM; - persistent_ram_write(cxt->mprz, buf, size); - return 0; + pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__); + return -EINVAL; } if (type != PSTORE_TYPE_DMESG) @@ -316,10 +430,10 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, if (part != 1) return -ENOSPC; - if (!cxt->przs) + if (!cxt->dprzs) return -ENOSPC; - prz = cxt->przs[cxt->dump_write_cnt]; + prz = cxt->dprzs[cxt->dump_write_cnt]; hlen = ramoops_write_kmsg_hdr(prz, compressed); if (size + hlen > prz->buffer_size) @@ -359,13 +473,15 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, case PSTORE_TYPE_DMESG: if (id >= cxt->max_dump_cnt) return -EINVAL; - prz = cxt->przs[id]; + prz = cxt->dprzs[id]; break; case PSTORE_TYPE_CONSOLE: prz = cxt->cprz; break; case PSTORE_TYPE_FTRACE: - prz = cxt->fprz; + if (id >= cxt->max_ftrace_cnt) + return -EINVAL; + prz = cxt->fprzs[id]; break; case PSTORE_TYPE_PMSG: prz = cxt->mprz; @@ -396,68 +512,113 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; - if (!cxt->przs) - return; + /* Free dump PRZs */ + if (cxt->dprzs) { + for (i = 0; i < cxt->max_dump_cnt; i++) + persistent_ram_free(cxt->dprzs[i]); - for (i = 0; i < cxt->max_dump_cnt; i++) - persistent_ram_free(cxt->przs[i]); + kfree(cxt->dprzs); + cxt->max_dump_cnt = 0; + } - kfree(cxt->przs); - cxt->max_dump_cnt = 0; + /* Free ftrace PRZs */ + if (cxt->fprzs) { + for (i = 0; i < cxt->max_ftrace_cnt; i++) + persistent_ram_free(cxt->fprzs[i]); + kfree(cxt->fprzs); + cxt->max_ftrace_cnt = 0; + } } -static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, - phys_addr_t *paddr, size_t dump_mem_sz) +static int ramoops_init_przs(const char *name, + struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone ***przs, + phys_addr_t *paddr, size_t mem_sz, + ssize_t record_size, + unsigned int *cnt, u32 sig, u32 flags) { int err = -ENOMEM; int i; + size_t zone_sz; + struct persistent_ram_zone **prz_ar; - if (!cxt->record_size) + /* Allocate nothing for 0 mem_sz or 0 record_size. */ + if (mem_sz == 0 || record_size == 0) { + *cnt = 0; return 0; + } - if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) { - dev_err(dev, "no room for dumps\n"); - return -ENOMEM; + /* + * If we have a negative record size, calculate it based on + * mem_sz / *cnt. If we have a positive record size, calculate + * cnt from mem_sz / record_size. + */ + if (record_size < 0) { + if (*cnt == 0) + return 0; + record_size = mem_sz / *cnt; + if (record_size == 0) { + dev_err(dev, "%s record size == 0 (%zu / %u)\n", + name, mem_sz, *cnt); + goto fail; + } + } else { + *cnt = mem_sz / record_size; + if (*cnt == 0) { + dev_err(dev, "%s record count == 0 (%zu / %zu)\n", + name, mem_sz, record_size); + goto fail; + } } - cxt->max_dump_cnt = dump_mem_sz / cxt->record_size; - if (!cxt->max_dump_cnt) - return -ENOMEM; + if (*paddr + mem_sz - cxt->phys_addr > cxt->size) { + dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", + name, + mem_sz, (unsigned long long)*paddr, + cxt->size, (unsigned long long)cxt->phys_addr); + goto fail; + } - cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt, - GFP_KERNEL); - if (!cxt->przs) { - dev_err(dev, "failed to initialize a prz array for dumps\n"); - goto fail_mem; + zone_sz = mem_sz / *cnt; + if (!zone_sz) { + dev_err(dev, "%s zone size == 0\n", name); + goto fail; } - for (i = 0; i < cxt->max_dump_cnt; i++) { - cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0, + prz_ar = kcalloc(*cnt, sizeof(**przs), GFP_KERNEL); + if (!prz_ar) + goto fail; + + for (i = 0; i < *cnt; i++) { + prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig, &cxt->ecc_info, - cxt->memtype); - if (IS_ERR(cxt->przs[i])) { - err = PTR_ERR(cxt->przs[i]); - dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - cxt->record_size, (unsigned long long)*paddr, err); + cxt->memtype, flags); + if (IS_ERR(prz_ar[i])) { + err = PTR_ERR(prz_ar[i]); + dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", + name, record_size, + (unsigned long long)*paddr, err); while (i > 0) { i--; - persistent_ram_free(cxt->przs[i]); + persistent_ram_free(prz_ar[i]); } - goto fail_prz; + kfree(prz_ar); + goto fail; } - *paddr += cxt->record_size; + *paddr += zone_sz; } + *przs = prz_ar; return 0; -fail_prz: - kfree(cxt->przs); -fail_mem: - cxt->max_dump_cnt = 0; + +fail: + *cnt = 0; return err; } -static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, +static int ramoops_init_prz(const char *name, + struct device *dev, struct ramoops_context *cxt, struct persistent_ram_zone **prz, phys_addr_t *paddr, size_t sz, u32 sig) { @@ -465,18 +626,19 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return 0; if (*paddr + sz - cxt->phys_addr > cxt->size) { - dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", - sz, (unsigned long long)*paddr, + dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", + name, sz, (unsigned long long)*paddr, cxt->size, (unsigned long long)cxt->phys_addr); return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, + cxt->memtype, 0); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); - dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - sz, (unsigned long long)*paddr, err); + dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", + name, sz, (unsigned long long)*paddr, err); return err; } @@ -543,6 +705,7 @@ static int ramoops_parse_dt(struct platform_device *pdev, parse_size("ftrace-size", pdata->ftrace_size); parse_size("pmsg-size", pdata->pmsg_size); parse_size("ecc-size", pdata->ecc_info.ecc_size); + parse_size("flags", pdata->flags); #undef parse_size @@ -561,6 +724,7 @@ static int ramoops_probe(struct platform_device *pdev) if (dev_of_node(dev) && !pdata) { pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) { + pr_err("cannot allocate platform data buffer\n"); err = -ENOMEM; goto fail_out; } @@ -570,11 +734,20 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; } - /* Only a single ramoops area allowed at a time, so fail extra + /* + * Only a single ramoops area allowed at a time, so fail extra * probes. */ - if (cxt->max_dump_cnt) + if (cxt->max_dump_cnt) { + pr_err("already initialized\n"); goto fail_out; + } + + /* Make sure we didn't get bogus platform data pointer. */ + if (!pdata) { + pr_err("NULL platform data\n"); + goto fail_out; + } if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && !pdata->ftrace_size && !pdata->pmsg_size)) { @@ -600,27 +773,37 @@ static int ramoops_probe(struct platform_device *pdev) cxt->ftrace_size = pdata->ftrace_size; cxt->pmsg_size = pdata->pmsg_size; cxt->dump_oops = pdata->dump_oops; + cxt->flags = pdata->flags; cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size - cxt->pmsg_size; - err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); + err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr, + dump_mem_sz, cxt->record_size, + &cxt->max_dump_cnt, 0, 0); if (err) goto fail_out; - err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr, + err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr, cxt->console_size, 0); if (err) goto fail_init_cprz; - err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size, - LINUX_VERSION_CODE); + cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + ? nr_cpu_ids + : 1; + err = ramoops_init_przs("ftrace", dev, cxt, &cxt->fprzs, &paddr, + cxt->ftrace_size, -1, + &cxt->max_ftrace_cnt, LINUX_VERSION_CODE, + (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + ? PRZ_FLAG_NO_LOCK : 0); if (err) goto fail_init_fprz; - err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); + err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, + cxt->pmsg_size, 0); if (err) goto fail_init_mprz; @@ -680,7 +863,6 @@ fail_clear: cxt->pstore.bufsize = 0; persistent_ram_free(cxt->mprz); fail_init_mprz: - persistent_ram_free(cxt->fprz); fail_init_fprz: persistent_ram_free(cxt->cprz); fail_init_cprz: @@ -699,7 +881,6 @@ static int ramoops_remove(struct platform_device *pdev) cxt->pstore.bufsize = 0; persistent_ram_free(cxt->mprz); - persistent_ram_free(cxt->fprz); persistent_ram_free(cxt->cprz); ramoops_free_przs(cxt); @@ -741,6 +922,8 @@ static void ramoops_register_dummy(void) dummy_data->ftrace_size = ramoops_ftrace_size; dummy_data->pmsg_size = ramoops_pmsg_size; dummy_data->dump_oops = dump_oops; + dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU; + /* * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC * (using 1 byte for ECC isn't much of use anyway). diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 3975deec02f8..a857338b7dab 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -48,16 +48,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz) return atomic_read(&prz->buffer->start); } -static DEFINE_RAW_SPINLOCK(buffer_lock); - /* increase and wrap the start pointer, returning the old value */ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) { int old; int new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->start); new = old + a; @@ -65,7 +64,8 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) new -= prz->buffer_size; atomic_set(&prz->buffer->start, new); - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); return old; } @@ -75,9 +75,10 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) { size_t old; size_t new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->size); if (old == prz->buffer_size) @@ -89,7 +90,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) atomic_set(&prz->buffer->size, new); exit: - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); } static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, @@ -465,7 +467,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, } static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, - struct persistent_ram_ecc_info *ecc_info) + struct persistent_ram_ecc_info *ecc_info, + unsigned long flags) { int ret; @@ -493,6 +496,8 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, prz->buffer->sig = sig; persistent_ram_zap(prz); + prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock); + prz->flags = flags; return 0; } @@ -517,7 +522,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, u32 sig, struct persistent_ram_ecc_info *ecc_info, - unsigned int memtype) + unsigned int memtype, u32 flags) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -532,7 +537,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; - ret = persistent_ram_post_init(prz, sig, ecc_info); + ret = persistent_ram_post_init(prz, sig, ecc_info, flags); if (ret) goto err; diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 8b252673d454..e99b1a72d9a7 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -12,14 +12,8 @@ static const struct genl_multicast_group quota_mcgrps[] = { }; /* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { - /* - * Needed due to multicast group ID abuse - old code assumed - * the family ID was also a valid multicast group ID (which - * isn't true) and userspace might thus rely on it. Assign a - * static ID for this group to make dealing with that easier. - */ - .id = GENL_ID_VFS_DQUOT, +static struct genl_family quota_genl_family __ro_after_init = { + .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 58b2dedb2a3a..cfeae9b0a2b7 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -19,6 +19,7 @@ #include <linux/quotaops.h> #include <linux/swap.h> #include <linux/uio.h> +#include <linux/bio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bc2dde2423c2..aa40c242f1db 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1111,7 +1111,8 @@ static int flush_commit_list(struct super_block *s, mark_buffer_dirty(jl->j_commit_bh) ; depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) - __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(jl->j_commit_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); @@ -1269,7 +1270,8 @@ static int _update_journal_header_block(struct super_block *sb, depth = reiserfs_write_unlock_nested(sb); if (reiserfs_barrier_flush(sb)) - __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(journal->j_header_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index a97e352d05d3..0037aea97d39 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -11,6 +11,7 @@ #include <linux/time.h> #include <linux/string.h> #include <linux/pagemap.h> +#include <linux/bio.h> #include "reiserfs.h" #include <linux/buffer_head.h> #include <linux/quotaops.h> diff --git a/fs/splice.c b/fs/splice.c index 5a7750bd2eea..8ed7c9d8c0fb 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -17,6 +17,7 @@ * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ +#include <linux/bvec.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index ce62a380314f..2751476e6b6e 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/buffer_head.h> +#include <linux/bio.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 7ff7712f284e..0a908ae7af13 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -50,3 +50,14 @@ config UBIFS_ATIME_SUPPORT strictatime is the "heavy", relatime is "lighter", etc. If unsure, say 'N' + +config UBIFS_FS_ENCRYPTION + bool "UBIFS Encryption" + depends on UBIFS_FS + select FS_ENCRYPTION + default n + help + Enable encryption of UBIFS files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index c54a24360f85..6f3251c2bf08 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -5,3 +5,4 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o ubifs-y += misc.o +ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c new file mode 100644 index 000000000000..3402720f2b28 --- /dev/null +++ b/fs/ubifs/crypto.c @@ -0,0 +1,97 @@ +#include "ubifs.h" + +static int ubifs_crypt_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ubifs_xattr_get(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len); +} + +static int ubifs_crypt_set_context(struct inode *inode, const void *ctx, + size_t len, void *fs_data) +{ + return ubifs_xattr_set(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); +} + +static bool ubifs_crypt_empty_dir(struct inode *inode) +{ + return ubifs_check_dir_empty(inode) == 0; +} + +static unsigned int ubifs_crypt_max_namelen(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + return UBIFS_MAX_INO_DATA; + else + return UBIFS_MAX_NLEN; +} + +static int ubifs_key_prefix(struct inode *inode, u8 **key) +{ + static char prefix[] = "ubifs:"; + + *key = prefix; + + return sizeof(prefix) - 1; +} + +int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, int block) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + void *p = &dn->data; + struct page *ret; + unsigned int pad_len = round_up(in_len, UBIFS_CIPHER_BLOCK_SIZE); + + ubifs_assert(pad_len <= *out_len); + dn->compr_size = cpu_to_le16(in_len); + + /* pad to full block cipher length */ + if (pad_len != in_len) + memset(p + in_len, 0, pad_len - in_len); + + ret = fscrypt_encrypt_page(inode, virt_to_page(&dn->data), pad_len, + offset_in_page(&dn->data), block, GFP_NOFS); + if (IS_ERR(ret)) { + ubifs_err(c, "fscrypt_encrypt_page failed: %ld", PTR_ERR(ret)); + return PTR_ERR(ret); + } + *out_len = pad_len; + + return 0; +} + +int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int *out_len, int block) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + unsigned int clen = le16_to_cpu(dn->compr_size); + unsigned int dlen = *out_len; + + if (clen <= 0 || clen > UBIFS_BLOCK_SIZE || clen > dlen) { + ubifs_err(c, "bad compr_size: %i", clen); + return -EINVAL; + } + + ubifs_assert(dlen <= UBIFS_BLOCK_SIZE); + err = fscrypt_decrypt_page(inode, virt_to_page(&dn->data), dlen, + offset_in_page(&dn->data), block); + if (err) { + ubifs_err(c, "fscrypt_decrypt_page failed: %i", err); + return err; + } + *out_len = clen; + + return 0; +} + +struct fscrypt_operations ubifs_crypt_operations = { + .flags = FS_CFLG_OWN_PAGES, + .get_context = ubifs_crypt_get_context, + .set_context = ubifs_crypt_set_context, + .is_encrypted = __ubifs_crypt_is_encrypted, + .empty_dir = ubifs_crypt_empty_dir, + .max_namelen = ubifs_crypt_max_namelen, + .key_prefix = ubifs_key_prefix, +}; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 69e287e20732..1e712a364680 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -233,7 +233,7 @@ static void dump_ch(const struct ubifs_ch *ch) void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; union ubifs_key key; struct ubifs_dent_node *dent, *pdent = NULL; int count = 2; @@ -289,8 +289,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) pr_err("\t%d: %s (%s)\n", count++, dent->name, get_dent_type(dent->type)); - nm.name = dent->name; - nm.len = le16_to_cpu(dent->nlen); + fname_name(&nm) = dent->name; + fname_len(&nm) = le16_to_cpu(dent->nlen); kfree(pdent); pdent = dent; key_read(c, &dent->key, &key); @@ -1107,7 +1107,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) unsigned int nlink = 2; union ubifs_key key; struct ubifs_dent_node *dent, *pdent = NULL; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; loff_t size = UBIFS_INO_NODE_SZ; if (!dbg_is_chk_gen(c)) @@ -1128,9 +1128,9 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) return err; } - nm.name = dent->name; - nm.len = le16_to_cpu(dent->nlen); - size += CALC_DENT_SIZE(nm.len); + fname_name(&nm) = dent->name; + fname_len(&nm) = le16_to_cpu(dent->nlen); + size += CALC_DENT_SIZE(fname_len(&nm)); if (dent->type == UBIFS_ITYPE_DIR) nlink += 1; kfree(pdent); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ca16c5d7bab1..1c5331ac9614 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -85,11 +85,26 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * initializes it. Returns new inode in case of success and an error code in * case of failure. */ -struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, +struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode) { + int err; struct inode *inode; struct ubifs_inode *ui; + bool encrypted = false; + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) { + ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err); + return ERR_PTR(err); + } + + if (!fscrypt_has_encryption_key(dir)) + return ERR_PTR(-EPERM); + + encrypted = true; + } inode = new_inode(c->vfs_sb); ui = ubifs_inode(inode); @@ -165,18 +180,29 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, */ ui->creat_sqnum = ++c->max_sqnum; spin_unlock(&c->cnt_lock); + + if (encrypted) { + err = fscrypt_inherit_context(dir, inode, &encrypted, true); + if (err) { + ubifs_err(c, "fscrypt_inherit_context failed: %i", err); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); + } + } + return inode; } static int dbg_check_name(const struct ubifs_info *c, const struct ubifs_dent_node *dent, - const struct qstr *nm) + const struct fscrypt_name *nm) { if (!dbg_is_chk_gen(c)) return 0; - if (le16_to_cpu(dent->nlen) != nm->len) + if (le16_to_cpu(dent->nlen) != fname_len(nm)) return -EINVAL; - if (memcmp(dent->name, nm->name, nm->len)) + if (memcmp(dent->name, fname_name(nm), fname_len(nm))) return -EINVAL; return 0; } @@ -189,30 +215,61 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct ubifs_dent_node *dent; struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); - if (dentry->d_name.len > UBIFS_MAX_NLEN) - return ERR_PTR(-ENAMETOOLONG); + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + + /* + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is + * created while the directory was encrypted and we + * have access to the key. + */ + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (err && err != -ENOKEY) + return ERR_PTR(err); + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); + if (err) + return ERR_PTR(err); + + if (fname_len(&nm) > UBIFS_MAX_NLEN) { + err = -ENAMETOOLONG; + goto out_fname; + } dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); - if (!dent) - return ERR_PTR(-ENOMEM); + if (!dent) { + err = -ENOMEM; + goto out_fname; + } - dent_key_init(c, &key, dir->i_ino, &dentry->d_name); + if (nm.hash) { + ubifs_assert(fname_len(&nm) == 0); + ubifs_assert(fname_name(&nm) == NULL); + dent_key_init_hash(c, &key, dir->i_ino, nm.hash); + err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash); + } else { + dent_key_init(c, &key, dir->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, dent, &nm); + } - err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); if (err) { if (err == -ENOENT) { dbg_gen("not found"); goto done; } - goto out; + goto out_dent; } - if (dbg_check_name(c, dent, &dentry->d_name)) { + if (dbg_check_name(c, dent, &nm)) { err = -EINVAL; - goto out; + goto out_dent; } inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); @@ -225,11 +282,12 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, ubifs_err(c, "dead directory entry '%pd', error %d", dentry, err); ubifs_ro_mode(c, err); - goto out; + goto out_dent; } done: kfree(dent); + fscrypt_free_filename(&nm); /* * Note, d_splice_alias() would be required instead if we supported * NFS. @@ -237,8 +295,10 @@ done: d_add(dentry, inode); return NULL; -out: +out_dent: kfree(dent); +out_fname: + fscrypt_free_filename(&nm); return ERR_PTR(err); } @@ -247,10 +307,11 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .dirtied_ino = 1 }; struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct fscrypt_name nm; + int err, sz_change; /* * Budget request settings: new inode, new direntry, changing the @@ -264,10 +325,16 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (err) return err; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, mode); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } err = ubifs_init_security(dir, inode, &dentry->d_name); @@ -278,12 +345,13 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); ubifs_release_budget(c, &req); + fscrypt_free_filename(&nm); insert_inode_hash(inode); d_instantiate(dentry, inode); return 0; @@ -295,6 +363,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); ubifs_err(c, "cannot create regular file, error %d", err); @@ -310,6 +380,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); int err, instantiated = 0; + struct fscrypt_name nm; /* * Budget request settings: new dirty inode, new direntry, @@ -319,13 +390,30 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, dbg_gen("dent '%pd', mode %#hx in dir ino %lu", dentry, mode, dir->i_ino); - err = ubifs_budget_space(c, &req); + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + return err; + + if (!fscrypt_has_encryption_key(dir)) { + return -EPERM; + } + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) return err; + err = ubifs_budget_space(c, &req); + if (err) { + fscrypt_free_filename(&nm); + return err; + } + err = ubifs_budget_space(c, &ino_req); if (err) { ubifs_release_budget(c, &req); + fscrypt_free_filename(&nm); return err; } @@ -361,7 +449,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, mutex_unlock(&ui->ui_mutex); mutex_lock(&dir_ui->ui_mutex); - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -380,6 +468,7 @@ out_budg: ubifs_release_budget(c, &req); if (!instantiated) ubifs_release_budget(c, &ino_req); + fscrypt_free_filename(&nm); ubifs_err(c, "cannot create temporary file, error %d", err); return err; } @@ -439,12 +528,14 @@ static unsigned int vfs_dent_type(uint8_t type) */ static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err = 0; - struct qstr nm; + int fstr_real_len = 0, err = 0; + struct fscrypt_name nm; + struct fscrypt_str fstr = {0}; union ubifs_key key; struct ubifs_dent_node *dent; struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; + bool encrypted = ubifs_crypt_is_encrypted(dir); dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); @@ -455,6 +546,18 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) */ return 0; + if (encrypted) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + + err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr); + if (err) + return err; + + fstr_real_len = fstr.len; + } + if (file->f_version == 0) { /* * The file was seek'ed, which means that @file->private_data @@ -476,12 +579,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) /* File positions 0 and 1 correspond to "." and ".." */ if (ctx->pos < 2) { ubifs_assert(!file->private_data); - if (!dir_emit_dots(file, ctx)) + if (!dir_emit_dots(file, ctx)) { + if (encrypted) + fscrypt_fname_free_buffer(&fstr); return 0; + } /* Find the first entry in TNC and save it */ lowest_dent_key(c, &key, dir->i_ino); - nm.name = NULL; + fname_len(&nm) = 0; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -499,7 +605,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) * Find the entry corresponding to @ctx->pos or the closest one. */ dent_key_init_hash(c, &key, dir->i_ino, ctx->pos); - nm.name = NULL; + fname_len(&nm) = 0; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -516,15 +622,33 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) ubifs_assert(le64_to_cpu(dent->ch.sqnum) > ubifs_inode(dir)->creat_sqnum); - nm.len = le16_to_cpu(dent->nlen); - if (!dir_emit(ctx, dent->name, nm.len, + fname_len(&nm) = le16_to_cpu(dent->nlen); + fname_name(&nm) = dent->name; + + if (encrypted) { + fstr.len = fstr_real_len; + + err = fscrypt_fname_disk_to_usr(dir, key_hash_flash(c, + &dent->key), + le32_to_cpu(dent->cookie), + &nm.disk_name, &fstr); + if (err) + goto out; + } else { + fstr.len = fname_len(&nm); + fstr.name = fname_name(&nm); + } + + if (!dir_emit(ctx, fstr.name, fstr.len, le64_to_cpu(dent->inum), - vfs_dent_type(dent->type))) + vfs_dent_type(dent->type))) { + if (encrypted) + fscrypt_fname_free_buffer(&fstr); return 0; + } /* Switch to the next entry */ key_read(c, &dent->key, &key); - nm.name = dent->name; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -541,6 +665,9 @@ out: kfree(file->private_data); file->private_data = NULL; + if (encrypted) + fscrypt_fname_free_buffer(&fstr); + if (err != -ENOENT) ubifs_err(c, "cannot find next direntry, error %d", err); else @@ -601,6 +728,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + struct fscrypt_name nm; /* * Budget request settings: new direntry, changing the target inode, @@ -613,13 +741,29 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); - err = dbg_check_synced_i_size(c, inode); + if (ubifs_crypt_is_encrypted(dir)) { + if (!fscrypt_has_permitted_context(dir, inode)) + return -EPERM; + + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + + if (!fscrypt_has_encryption_key(inode)) + return -EPERM; + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) return err; + err = dbg_check_synced_i_size(c, inode); + if (err) + goto out_fname; + err = ubifs_budget_space(c, &req); if (err) - return err; + goto out_fname; lock_2_inodes(dir, inode); inc_nlink(inode); @@ -628,13 +772,14 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -644,6 +789,8 @@ out_cancel: unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); +out_fname: + fscrypt_free_filename(&nm); return err; } @@ -652,10 +799,10 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) struct ubifs_info *c = dir->i_sb->s_fs_info; struct inode *inode = d_inode(dentry); struct ubifs_inode *dir_ui = ubifs_inode(dir); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); - int err, budgeted = 1; + int err, sz_change, budgeted = 1; struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; unsigned int saved_nlink = inode->i_nlink; + struct fscrypt_name nm; /* * Budget request settings: deletion direntry, deletion inode (+1 for @@ -667,16 +814,29 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); + if (err) + return err; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) - return err; + goto out_fname; err = ubifs_budget_space(c, &req); if (err) { if (err != -ENOSPC) - return err; + goto out_fname; budgeted = 0; } @@ -686,7 +846,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -698,6 +858,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -707,21 +868,23 @@ out_cancel: unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); +out_fname: + fscrypt_free_filename(&nm); return err; } /** * check_dir_empty - check if a directory is empty or not. - * @c: UBIFS file-system description object * @dir: VFS inode object of the directory to check * * This function checks if directory @dir is empty. Returns zero if the * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes * in case of of errors. */ -static int check_dir_empty(struct ubifs_info *c, struct inode *dir) +int ubifs_check_dir_empty(struct inode *dir) { - struct qstr nm = { .name = NULL }; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm = { 0 }; struct ubifs_dent_node *dent; union ubifs_key key; int err; @@ -743,10 +906,10 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; struct inode *inode = d_inode(dentry); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); - int err, budgeted = 1; + int err, sz_change, budgeted = 1; struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + struct fscrypt_name nm; /* * Budget request settings: deletion direntry, deletion inode and @@ -758,14 +921,26 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino, dir->i_ino); ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); - err = check_dir_empty(c, d_inode(dentry)); + err = ubifs_check_dir_empty(d_inode(dentry)); if (err) return err; + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); + if (err) + return err; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + err = ubifs_budget_space(c, &req); if (err) { if (err != -ENOSPC) - return err; + goto out_fname; budgeted = 0; } @@ -776,7 +951,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -788,6 +963,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -798,6 +974,8 @@ out_cancel: unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); +out_fname: + fscrypt_free_filename(&nm); return err; } @@ -806,8 +984,9 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; - int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, sz_change; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct fscrypt_name nm; /* * Budget request settings: new inode, new direntry and changing parent @@ -821,10 +1000,27 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) return err; + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + goto out_budg; + + if (!fscrypt_has_encryption_key(dir)) { + err = -EPERM; + goto out_budg; + } + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } err = ubifs_init_security(dir, inode, &dentry->d_name); @@ -838,7 +1034,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; @@ -847,6 +1043,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ubifs_release_budget(c, &req); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -857,6 +1054,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -870,11 +1069,12 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; union ubifs_dev_desc *dev = NULL; - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int sz_change; int err, devlen = 0; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .new_ino_d = ALIGN(devlen, 8), .dirtied_ino = 1 }; + struct fscrypt_name nm; /* * Budget request settings: new inode, new direntry and changing parent @@ -896,11 +1096,28 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, return err; } + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + goto out_budg; + + if (!fscrypt_has_encryption_key(dir)) { + err = -EPERM; + goto out_budg; + } + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, mode); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } init_special_inode(inode, inode->i_mode, rdev); @@ -917,7 +1134,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -925,6 +1142,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, ubifs_release_budget(c, &req); insert_inode_hash(inode); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -934,6 +1152,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -947,10 +1167,27 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, len = strlen(symname); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int sz_change = CALC_DENT_SIZE(len); + struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); + struct fscrypt_symlink_data *sd = NULL; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .new_ino_d = ALIGN(len, 8), .dirtied_ino = 1 }; + struct fscrypt_name nm; + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + goto out_budg; + + if (!fscrypt_has_encryption_key(dir)) { + err = -EPERM; + goto out_budg; + } + + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); + } /* * Budget request settings: new inode, new direntry and changing parent @@ -960,36 +1197,77 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, symname, dir->i_ino); - if (len > UBIFS_MAX_INO_DATA) + if (disk_link.len > UBIFS_MAX_INO_DATA) return -ENAMETOOLONG; err = ubifs_budget_space(c, &req); if (err) return err; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } ui = ubifs_inode(inode); - ui->data = kmalloc(len + 1, GFP_NOFS); + ui->data = kmalloc(disk_link.len, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_inode; } - memcpy(ui->data, symname, len); - ((char *)ui->data)[len] = '\0'; - inode->i_link = ui->data; + if (ubifs_crypt_is_encrypted(dir)) { + struct qstr istr = QSTR_INIT(symname, len); + struct fscrypt_str ostr; + + sd = kzalloc(disk_link.len, GFP_NOFS); + if (!sd) { + err = -ENOMEM; + goto out_inode; + } + + err = fscrypt_get_encryption_info(inode); + if (err) { + kfree(sd); + goto out_inode; + } + + if (!fscrypt_has_encryption_key(inode)) { + kfree(sd); + err = -EPERM; + goto out_inode; + } + + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (err) { + kfree(sd); + goto out_inode; + } + + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *)sd; + } else { + inode->i_link = ui->data; + } + + memcpy(ui->data, disk_link.name, disk_link.len); + ((char *)ui->data)[disk_link.len - 1] = '\0'; + /* * The terminating zero byte is not written to the flash media and it * is put just to make later in-memory string processing simpler. Thus, * data length is @len, not @len + %1. */ - ui->data_len = len; - inode->i_size = ubifs_inode(inode)->ui_size = len; + ui->data_len = disk_link.len - 1; + inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) @@ -999,7 +1277,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1007,6 +1285,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, ubifs_release_budget(c, &req); insert_inode_hash(inode); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -1016,6 +1295,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -1078,15 +1359,14 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_inode *whiteout_ui = NULL; int err, release, sync = 0, move = (new_dir != old_dir); int is_dir = S_ISDIR(old_inode->i_mode); - int unlink = !!new_inode; - int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); - int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + int unlink = !!new_inode, new_sz, old_sz; struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; unsigned int uninitialized_var(saved_nlink); + struct fscrypt_name old_nm, new_nm; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -1107,17 +1387,41 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink) ubifs_assert(inode_is_locked(new_inode)); + if (old_dir != new_dir) { + if (ubifs_crypt_is_encrypted(new_dir) && + !fscrypt_has_permitted_context(new_dir, old_inode)) + return -EPERM; + } + if (unlink && is_dir) { - err = check_dir_empty(c, new_inode); + err = ubifs_check_dir_empty(new_inode); if (err) return err; } - err = ubifs_budget_space(c, &req); + err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_nm); if (err) return err; + + err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_nm); + if (err) { + fscrypt_free_filename(&old_nm); + return err; + } + + new_sz = CALC_DENT_SIZE(fname_len(&new_nm)); + old_sz = CALC_DENT_SIZE(fname_len(&old_nm)); + + err = ubifs_budget_space(c, &req); + if (err) { + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); + return err; + } err = ubifs_budget_space(c, &ino_req); if (err) { + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); ubifs_release_budget(c, &req); return err; } @@ -1239,8 +1543,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, iput(whiteout); } - err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, whiteout, - sync); + err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, + new_inode, &new_nm, whiteout, sync); if (err) goto out_cancel; @@ -1256,6 +1560,9 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); return err; out_cancel: @@ -1284,6 +1591,8 @@ out_cancel: unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); return err; } @@ -1298,9 +1607,27 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, struct inode *snd_inode = d_inode(new_dentry); struct timespec time; int err; + struct fscrypt_name fst_nm, snd_nm; ubifs_assert(fst_inode && snd_inode); + if ((ubifs_crypt_is_encrypted(old_dir) || + ubifs_crypt_is_encrypted(new_dir)) && + (old_dir != new_dir) && + (!fscrypt_has_permitted_context(new_dir, fst_inode) || + !fscrypt_has_permitted_context(old_dir, snd_inode))) + return -EPERM; + + err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm); + if (err) + return err; + + err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &snd_nm); + if (err) { + fscrypt_free_filename(&fst_nm); + return err; + } + lock_4_inodes(old_dir, new_dir, NULL, NULL); time = ubifs_current_time(old_dir); @@ -1320,12 +1647,14 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, } } - err = ubifs_jnl_xrename(c, old_dir, old_dentry, new_dir, new_dentry, - sync); + err = ubifs_jnl_xrename(c, old_dir, fst_inode, &fst_nm, new_dir, + snd_inode, &snd_nm, sync); unlock_4_inodes(old_dir, new_dir, NULL, NULL); ubifs_release_budget(c, &req); + fscrypt_free_filename(&fst_nm); + fscrypt_free_filename(&snd_nm); return err; } @@ -1384,6 +1713,14 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int ubifs_dir_open(struct inode *dir, struct file *file) +{ + if (ubifs_crypt_is_encrypted(dir)) + return fscrypt_get_encryption_info(dir) ? -EACCES : 0; + + return 0; +} + const struct inode_operations ubifs_dir_inode_operations = { .lookup = ubifs_lookup, .create = ubifs_create, @@ -1410,6 +1747,7 @@ const struct file_operations ubifs_dir_operations = { .iterate_shared = ubifs_readdir, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, + .open = ubifs_dir_open, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b4fbeefba246..aa0625f4f642 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -78,6 +78,13 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, goto dump; dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, block); + if (err) + goto dump; + } + out_len = UBIFS_BLOCK_SIZE; err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, le16_to_cpu(dn->compr_type)); @@ -650,6 +657,13 @@ static int populate_page(struct ubifs_info *c, struct page *page, dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; out_len = UBIFS_BLOCK_SIZE; + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, page_block); + if (err) + goto out_err; + } + err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, le16_to_cpu(dn->compr_type)); if (err || len != out_len) @@ -1594,6 +1608,15 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) { int err; + struct inode *inode = file->f_mapping->host; + + if (ubifs_crypt_is_encrypted(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } err = generic_file_mmap(file, vma); if (err) @@ -1605,6 +1628,88 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int ubifs_file_open(struct inode *inode, struct file *filp) +{ + int ret; + struct dentry *dir; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + if (ubifs_crypt_is_encrypted(inode)) { + ret = fscrypt_get_encryption_info(inode); + if (ret) + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + + dir = dget_parent(file_dentry(filp)); + if (ubifs_crypt_is_encrypted(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + ubifs_err(c, "Inconsistent encryption contexts: %lu/%lu", + (unsigned long) d_inode(dir)->i_ino, + (unsigned long) inode->i_ino); + dput(dir); + ubifs_ro_mode(c, -EPERM); + return -EPERM; + } + dput(dir); + + return 0; +} + +static const char *ubifs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + int err; + struct fscrypt_symlink_data *sd; + struct ubifs_inode *ui = ubifs_inode(inode); + struct fscrypt_str cstr; + struct fscrypt_str pstr; + + if (!ubifs_crypt_is_encrypted(inode)) + return ui->data; + + if (!dentry) + return ERR_PTR(-ECHILD); + + err = fscrypt_get_encryption_info(inode); + if (err) + return ERR_PTR(err); + + sd = (struct fscrypt_symlink_data *)ui->data; + cstr.name = sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + if (cstr.len == 0) + return ERR_PTR(-ENOENT); + + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > ui->data_len) + return ERR_PTR(-EIO); + + err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (err) + return ERR_PTR(err); + + err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); + if (err) { + fscrypt_fname_free_buffer(&pstr); + return ERR_PTR(err); + } + + pstr.name[pstr.len] = '\0'; + + // XXX this probably won't happen anymore... + if (pstr.name[0] == '\0') { + fscrypt_fname_free_buffer(&pstr); + return ERR_PTR(-ENOENT); + } + + set_delayed_call(done, kfree_link, pstr.name); + return pstr.name; +} + + const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, @@ -1629,7 +1734,7 @@ const struct inode_operations ubifs_file_inode_operations = { const struct inode_operations ubifs_symlink_inode_operations = { .readlink = generic_readlink, - .get_link = simple_get_link, + .get_link = ubifs_get_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .listxattr = ubifs_listxattr, @@ -1647,6 +1752,7 @@ const struct file_operations ubifs_file_operations = { .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .open = ubifs_file_open, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index e845c64b6ce1..7b35e3d6cde7 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -846,10 +846,6 @@ int ubifs_gc_start_commit(struct ubifs_info *c) */ while (1) { lp = ubifs_fast_find_freeable(c); - if (IS_ERR(lp)) { - err = PTR_ERR(lp); - goto out; - } if (!lp) break; ubifs_assert(!(lp->flags & LPROPS_TAKEN)); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 97be41215332..3be28900bf37 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -452,16 +452,22 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) */ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { + ktime_t softlimit = ms_to_ktime(dirty_writeback_interval * 10); + unsigned long long delta = dirty_writeback_interval; + + /* centi to milli, milli to nano, then 10% */ + delta *= 10ULL * NSEC_PER_MSEC / 10ULL; + ubifs_assert(!hrtimer_active(&wbuf->timer)); + ubifs_assert(delta <= ULONG_MAX); if (wbuf->no_timer) return; dbg_io("set timer for jhead %s, %llu-%llu millisecs", dbg_jhead(wbuf->jhead), - div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), - div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, - USEC_PER_SEC)); - hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + div_u64(ktime_to_ns(softlimit), USEC_PER_SEC), + div_u64(ktime_to_ns(softlimit) + delta, USEC_PER_SEC)); + hrtimer_start_range_ns(&wbuf->timer, softlimit, delta, HRTIMER_MODE_REL); } @@ -1059,10 +1065,6 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); wbuf->timer.function = wbuf_timer_callback_nolock; - wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0); - wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT; - wbuf->delta *= 1000000000ULL; - ubifs_assert(wbuf->delta <= ULONG_MAX); return 0; } diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 3c7b29de0ca7..78d713644df3 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -181,6 +181,26 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) mnt_drop_write_file(file); return err; } + case FS_IOC_SET_ENCRYPTION_POLICY: { +#ifdef CONFIG_UBIFS_FS_ENCRYPTION + struct ubifs_info *c = inode->i_sb->s_fs_info; + + err = ubifs_enable_encryption(c); + if (err) + return err; + + return fscrypt_ioctl_set_policy(file, (const void __user *)arg); +#else + return -EOPNOTSUPP; +#endif + } + case FS_IOC_GET_ENCRYPTION_POLICY: { +#ifdef CONFIG_UBIFS_FS_ENCRYPTION + return fscrypt_ioctl_get_policy(file, (void __user *)arg); +#else + return -EOPNOTSUPP; +#endif + } default: return -ENOTTY; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 91bc76dc559e..a459211a1c21 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -78,16 +78,6 @@ static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) { dent->padding1 = 0; - memset(dent->padding2, 0, 4); -} - -/** - * zero_data_node_unused - zero out unused fields of an on-flash data node. - * @data: the data node to zero out - */ -static inline void zero_data_node_unused(struct ubifs_data_node *data) -{ - memset(data->padding, 0, 2); } /** @@ -511,6 +501,14 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) ui->dirty = 0; } +static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) +{ + if (c->double_hash) + dent->cookie = prandom_u32(); + else + dent->cookie = 0; +} + /** * ubifs_jnl_update - update inode. * @c: UBIFS file-system description object @@ -539,7 +537,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) * success. In case of failure, a negative error code is returned. */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, - const struct qstr *nm, const struct inode *inode, + const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent) { int err, dlen, ilen, len, lnum, ino_offs, dent_offs; @@ -551,11 +549,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, struct ubifs_ino_node *ino; union ubifs_key dent_key, ino_key; - dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", - inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + //dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + // inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); - dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + dlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1; ilen = UBIFS_INO_NODE_SZ; /* @@ -596,9 +594,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, key_write(c, &dent_key, dent->key); dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); dent->type = get_dent_type(inode->i_mode); - dent->nlen = cpu_to_le16(nm->len); - memcpy(dent->name, nm->name, nm->len); - dent->name[nm->len] = '\0'; + dent->nlen = cpu_to_le16(fname_len(nm)); + memcpy(dent->name, fname_name(nm), fname_len(nm)); + dent->name[fname_len(nm)] = '\0'; + set_dent_cookie(c, dent); + zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen, 0); @@ -697,14 +697,18 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len) { struct ubifs_data_node *data; - int err, lnum, offs, compr_type, out_len; + int err, lnum, offs, compr_type, out_len, compr_len; int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); + bool encrypted = ubifs_crypt_is_encrypted(inode); dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", (unsigned long)key_inum(c, key), key_block(c, key), len); ubifs_assert(len <= UBIFS_BLOCK_SIZE); + if (encrypted) + dlen += UBIFS_CIPHER_BLOCK_SIZE; + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); if (!data) { /* @@ -722,7 +726,6 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, data->ch.node_type = UBIFS_DATA_NODE; key_write(c, key, &data->key); data->size = cpu_to_le32(len); - zero_data_node_unused(data); if (!(ui->flags & UBIFS_COMPR_FL)) /* Compression is disabled for this inode */ @@ -730,9 +733,18 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, else compr_type = ui->compr_type; - out_len = dlen - UBIFS_DATA_NODE_SZ; - ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type); - ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + out_len = compr_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(c, buf, len, &data->data, &compr_len, &compr_type); + ubifs_assert(compr_len <= UBIFS_BLOCK_SIZE); + + if (encrypted) { + err = ubifs_encrypt(inode, data, compr_len, &out_len, key_block(c, key)); + if (err) + goto out_free; + + } else { + data->compr_size = 0; + } dlen = UBIFS_DATA_NODE_SZ + out_len; data->compr_type = cpu_to_le16(compr_type); @@ -911,9 +923,11 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) * ubifs_jnl_xrename - cross rename two directory entries. * @c: UBIFS file-system description object * @fst_dir: parent inode of 1st directory entry to exchange - * @fst_dentry: 1st directory entry to exchange + * @fst_inode: 1st inode to exchange + * @fst_nm: name of 1st inode to exchange * @snd_dir: parent inode of 2nd directory entry to exchange - * @snd_dentry: 2nd directory entry to exchange + * @snd_inode: 2nd inode to exchange + * @snd_nm: name of 2nd inode to exchange * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the cross rename operation which may involve @@ -922,29 +936,29 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) * returned. */ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, - const struct dentry *fst_dentry, + const struct inode *fst_inode, + const struct fscrypt_name *fst_nm, const struct inode *snd_dir, - const struct dentry *snd_dentry, int sync) + const struct inode *snd_inode, + const struct fscrypt_name *snd_nm, int sync) { union ubifs_key key; struct ubifs_dent_node *dent1, *dent2; int err, dlen1, dlen2, lnum, offs, len, plen = UBIFS_INO_NODE_SZ; int aligned_dlen1, aligned_dlen2; int twoparents = (fst_dir != snd_dir); - const struct inode *fst_inode = d_inode(fst_dentry); - const struct inode *snd_inode = d_inode(snd_dentry); void *p; - dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu", - fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino); + //dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu", + // fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino); ubifs_assert(ubifs_inode(fst_dir)->data_len == 0); ubifs_assert(ubifs_inode(snd_dir)->data_len == 0); ubifs_assert(mutex_is_locked(&ubifs_inode(fst_dir)->ui_mutex)); ubifs_assert(mutex_is_locked(&ubifs_inode(snd_dir)->ui_mutex)); - dlen1 = UBIFS_DENT_NODE_SZ + snd_dentry->d_name.len + 1; - dlen2 = UBIFS_DENT_NODE_SZ + fst_dentry->d_name.len + 1; + dlen1 = UBIFS_DENT_NODE_SZ + fname_len(snd_nm) + 1; + dlen2 = UBIFS_DENT_NODE_SZ + fname_len(fst_nm) + 1; aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); @@ -963,24 +977,24 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, /* Make new dent for 1st entry */ dent1->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, &snd_dentry->d_name); + dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, snd_nm); dent1->inum = cpu_to_le64(fst_inode->i_ino); dent1->type = get_dent_type(fst_inode->i_mode); - dent1->nlen = cpu_to_le16(snd_dentry->d_name.len); - memcpy(dent1->name, snd_dentry->d_name.name, snd_dentry->d_name.len); - dent1->name[snd_dentry->d_name.len] = '\0'; + dent1->nlen = cpu_to_le16(fname_len(snd_nm)); + memcpy(dent1->name, fname_name(snd_nm), fname_len(snd_nm)); + dent1->name[fname_len(snd_nm)] = '\0'; zero_dent_node_unused(dent1); ubifs_prep_grp_node(c, dent1, dlen1, 0); /* Make new dent for 2nd entry */ dent2 = (void *)dent1 + aligned_dlen1; dent2->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, &fst_dentry->d_name); + dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, fst_nm); dent2->inum = cpu_to_le64(snd_inode->i_ino); dent2->type = get_dent_type(snd_inode->i_mode); - dent2->nlen = cpu_to_le16(fst_dentry->d_name.len); - memcpy(dent2->name, fst_dentry->d_name.name, fst_dentry->d_name.len); - dent2->name[fst_dentry->d_name.len] = '\0'; + dent2->nlen = cpu_to_le16(fname_len(fst_nm)); + memcpy(dent2->name, fname_name(fst_nm), fname_len(fst_nm)); + dent2->name[fname_len(fst_nm)] = '\0'; zero_dent_node_unused(dent2); ubifs_prep_grp_node(c, dent2, dlen2, 0); @@ -1004,14 +1018,14 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, } release_head(c, BASEHD); - dent_key_init(c, &key, snd_dir->i_ino, &snd_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &snd_dentry->d_name); + dent_key_init(c, &key, snd_dir->i_ino, snd_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, snd_nm); if (err) goto out_ro; offs += aligned_dlen1; - dent_key_init(c, &key, fst_dir->i_ino, &fst_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &fst_dentry->d_name); + dent_key_init(c, &key, fst_dir->i_ino, fst_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, fst_nm); if (err) goto out_ro; @@ -1063,31 +1077,31 @@ out_free: * returned. */ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, - const struct dentry *old_dentry, + const struct inode *old_inode, + const struct fscrypt_name *old_nm, const struct inode *new_dir, - const struct dentry *new_dentry, + const struct inode *new_inode, + const struct fscrypt_name *new_nm, const struct inode *whiteout, int sync) { void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; int err, dlen1, dlen2, ilen, lnum, offs, len; - const struct inode *old_inode = d_inode(old_dentry); - const struct inode *new_inode = d_inode(new_dentry); int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); struct ubifs_inode *uninitialized_var(new_ui); - dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu", - old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino); + //dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu", + // old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino); ubifs_assert(ubifs_inode(old_dir)->data_len == 0); ubifs_assert(ubifs_inode(new_dir)->data_len == 0); ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex)); - dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1; - dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1; + dlen1 = UBIFS_DENT_NODE_SZ + fname_len(new_nm) + 1; + dlen2 = UBIFS_DENT_NODE_SZ + fname_len(old_nm) + 1; if (new_inode) { new_ui = ubifs_inode(new_inode); ubifs_assert(mutex_is_locked(&new_ui->ui_mutex)); @@ -1113,19 +1127,19 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, /* Make new dent */ dent->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name); + dent_key_init_flash(c, &dent->key, new_dir->i_ino, new_nm); dent->inum = cpu_to_le64(old_inode->i_ino); dent->type = get_dent_type(old_inode->i_mode); - dent->nlen = cpu_to_le16(new_dentry->d_name.len); - memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len); - dent->name[new_dentry->d_name.len] = '\0'; + dent->nlen = cpu_to_le16(fname_len(new_nm)); + memcpy(dent->name, fname_name(new_nm), fname_len(new_nm)); + dent->name[fname_len(new_nm)] = '\0'; + set_dent_cookie(c, dent); zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen1, 0); dent2 = (void *)dent + aligned_dlen1; dent2->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent2->key, old_dir->i_ino, - &old_dentry->d_name); + dent_key_init_flash(c, &dent2->key, old_dir->i_ino, old_nm); if (whiteout) { dent2->inum = cpu_to_le64(whiteout->i_ino); @@ -1135,9 +1149,10 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, dent2->inum = 0; dent2->type = DT_UNKNOWN; } - dent2->nlen = cpu_to_le16(old_dentry->d_name.len); - memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); - dent2->name[old_dentry->d_name.len] = '\0'; + dent2->nlen = cpu_to_le16(fname_len(old_nm)); + memcpy(dent2->name, fname_name(old_nm), fname_len(old_nm)); + dent2->name[fname_len(old_nm)] = '\0'; + set_dent_cookie(c, dent2); zero_dent_node_unused(dent2); ubifs_prep_grp_node(c, dent2, dlen2, 0); @@ -1178,15 +1193,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } release_head(c, BASEHD); - dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name); + dent_key_init(c, &key, new_dir->i_ino, new_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, new_nm); if (err) goto out_ro; offs += aligned_dlen1; if (whiteout) { - dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &old_dentry->d_name); + dent_key_init(c, &key, old_dir->i_ino, old_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, old_nm); if (err) goto out_ro; @@ -1196,8 +1211,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (err) goto out_ro; - dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); - err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + dent_key_init(c, &key, old_dir->i_ino, old_nm); + err = ubifs_tnc_remove_nm(c, &key, old_nm); if (err) goto out_ro; } @@ -1251,31 +1266,55 @@ out_free: } /** - * recomp_data_node - re-compress a truncated data node. + * truncate_data_node - re-compress/encrypt a truncated data node. + * @c: UBIFS file-system description object + * @inode: inode which referes to the data node + * @block: data block number * @dn: data node to re-compress * @new_len: new length * * This function is used when an inode is truncated and the last data node of - * the inode has to be re-compressed and re-written. + * the inode has to be re-compressed/encrypted and re-written. */ -static int recomp_data_node(const struct ubifs_info *c, - struct ubifs_data_node *dn, int *new_len) +static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, + unsigned int block, struct ubifs_data_node *dn, + int *new_len) { void *buf; - int err, len, compr_type, out_len; + int err, dlen, compr_type, out_len, old_dlen; out_len = le32_to_cpu(dn->size); buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); - err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type); - if (err) - goto out; - ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, block); + if (err) + goto out; + } + + if (compr_type != UBIFS_COMPR_NONE) { + err = ubifs_decompress(c, &dn->data, dlen, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); + } + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + if (err) + goto out; + + out_len = old_dlen; + } else { + dn->compr_size = 0; + } + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); dn->compr_type = cpu_to_le16(compr_type); dn->size = cpu_to_le32(*new_len); @@ -1347,17 +1386,9 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (le32_to_cpu(dn->size) <= dlen) dlen = 0; /* Nothing to do */ else { - int compr_type = le16_to_cpu(dn->compr_type); - - if (compr_type != UBIFS_COMPR_NONE) { - err = recomp_data_node(c, dn, &dlen); - if (err) - goto out_free; - } else { - dn->size = cpu_to_le32(dlen); - dlen += UBIFS_DATA_NODE_SZ; - } - zero_data_node_unused(dn); + err = truncate_data_node(c, inode, blk, dn, &dlen); + if (err) + goto out_free; } } } @@ -1442,7 +1473,8 @@ out_free: * error code in case of failure. */ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, - const struct inode *inode, const struct qstr *nm) + const struct inode *inode, + const struct fscrypt_name *nm) { int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; struct ubifs_dent_node *xent; @@ -1451,9 +1483,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, int sync = IS_DIRSYNC(host); struct ubifs_inode *host_ui = ubifs_inode(host); - dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", - host->i_ino, inode->i_ino, nm->name, - ubifs_inode(inode)->data_len); + //dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", + // host->i_ino, inode->i_ino, nm->name, + // ubifs_inode(inode)->data_len); ubifs_assert(inode->i_nlink == 0); ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); @@ -1461,7 +1493,7 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, * Since we are deleting the inode, we do not bother to attach any data * to it and assume its length is %UBIFS_INO_NODE_SZ. */ - xlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + xlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1; aligned_xlen = ALIGN(xlen, 8); hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); @@ -1482,9 +1514,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, key_write(c, &xent_key, xent->key); xent->inum = 0; xent->type = get_dent_type(inode->i_mode); - xent->nlen = cpu_to_le16(nm->len); - memcpy(xent->name, nm->name, nm->len); - xent->name[nm->len] = '\0'; + xent->nlen = cpu_to_le16(fname_len(nm)); + memcpy(xent->name, fname_name(nm), fname_len(nm)); + xent->name[fname_len(nm)] = '\0'; zero_dent_node_unused(xent); ubifs_prep_grp_node(c, xent, xlen, 0); diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index c0a95e393347..7547be512db2 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -69,7 +69,7 @@ static inline uint32_t key_r5_hash(const char *s, int len) uint32_t a = 0; const signed char *str = (const signed char *)s; - while (*str) { + while (len--) { a += *str << 4; a += *str >> 4; a *= 11; @@ -153,13 +153,13 @@ static inline void highest_ino_key(const struct ubifs_info *c, * @c: UBIFS file-system description object * @key: key to initialize * @inum: parent inode number - * @nm: direntry name and length + * @nm: direntry name and length. Not a string when encrypted! */ static inline void dent_key_init(const struct ubifs_info *c, union ubifs_key *key, ino_t inum, - const struct qstr *nm) + const struct fscrypt_name *nm) { - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->u32[0] = inum; @@ -191,10 +191,11 @@ static inline void dent_key_init_hash(const struct ubifs_info *c, * @nm: direntry name and length */ static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, - ino_t inum, const struct qstr *nm) + ino_t inum, + const struct fscrypt_name *nm) { union ubifs_key *key = k; - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->j32[0] = cpu_to_le32(inum); @@ -225,9 +226,9 @@ static inline void lowest_dent_key(const struct ubifs_info *c, */ static inline void xent_key_init(const struct ubifs_info *c, union ubifs_key *key, ino_t inum, - const struct qstr *nm) + const struct fscrypt_name *nm) { - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->u32[0] = inum; @@ -242,10 +243,10 @@ static inline void xent_key_init(const struct ubifs_info *c, * @nm: extended attribute entry name and length */ static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, - ino_t inum, const struct qstr *nm) + ino_t inum, const struct fscrypt_name *nm) { union ubifs_key *key = k; - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->j32[0] = cpu_to_le32(inum); diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index fb0f44cd1e28..ae5c02f22f3e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -61,7 +61,7 @@ struct replay_entry { struct list_head list; union ubifs_key key; union { - struct qstr nm; + struct fscrypt_name nm; struct { loff_t old_size; loff_t new_size; @@ -327,7 +327,7 @@ static void destroy_replay_list(struct ubifs_info *c) list_for_each_entry_safe(r, tmp, &c->replay_list, list) { if (is_hash_key(c, &r->key)) - kfree(r->nm.name); + kfree(fname_name(&r->nm)); list_del(&r->list); kfree(r); } @@ -430,10 +430,10 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, r->deletion = !!deletion; r->sqnum = sqnum; key_copy(c, key, &r->key); - r->nm.len = nlen; + fname_len(&r->nm) = nlen; memcpy(nbuf, name, nlen); nbuf[nlen] = '\0'; - r->nm.name = nbuf; + fname_name(&r->nm) = nbuf; list_add_tail(&r->list, &c->replay_list); return 0; @@ -456,7 +456,7 @@ int ubifs_validate_entry(struct ubifs_info *c, if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || dent->type >= UBIFS_ITYPES_CNT || nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || - strnlen(dent->name, nlen) != nlen || + (key_type == UBIFS_XENT_KEY && strnlen(dent->name, nlen) != nlen) || le64_to_cpu(dent->inum) > MAX_INUM) { ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ? "directory entry" : "extended attribute entry"); diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 3cbb904a6d7d..7f1ead29e727 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -163,6 +163,7 @@ static int create_default_filesystem(struct ubifs_info *c) tmp64 = (long long)max_buds * c->leb_size; if (big_lpt) sup_flags |= UBIFS_FLG_BIGLPT; + sup_flags |= UBIFS_FLG_DOUBLE_HASH; sup->ch.node_type = UBIFS_SB_NODE; sup->key_hash = UBIFS_KEY_HASH_R5; @@ -465,6 +466,16 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) goto failed; } + if (!c->double_hash && c->fmt_version >= 5) { + err = 16; + goto failed; + } + + if (c->encrypted && c->fmt_version < 5) { + err = 17; + goto failed; + } + return 0; failed: @@ -620,6 +631,24 @@ int ubifs_read_superblock(struct ubifs_info *c) memcpy(&c->uuid, &sup->uuid, 16); c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); + c->double_hash = !!(sup_flags & UBIFS_FLG_DOUBLE_HASH); + c->encrypted = !!(sup_flags & UBIFS_FLG_ENCRYPTION); + + if ((sup_flags & ~UBIFS_FLG_MASK) != 0) { + ubifs_err(c, "Unknown feature flags found: %#x", + sup_flags & ~UBIFS_FLG_MASK); + err = -EINVAL; + goto out; + } + +#ifndef CONFIG_UBIFS_FS_ENCRYPTION + if (c->encrypted) { + ubifs_err(c, "file system contains encrypted files but UBIFS" + " was built without crypto support."); + err = -EINVAL; + goto out; + } +#endif /* Automatically increase file system size to the maximum size */ c->old_leb_cnt = c->leb_cnt; @@ -807,3 +836,33 @@ int ubifs_fixup_free_space(struct ubifs_info *c) ubifs_msg(c, "free space fixup complete"); return err; } + +int ubifs_enable_encryption(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + if (c->encrypted) + return 0; + + if (c->ro_mount || c->ro_media) + return -EROFS; + + if (c->fmt_version < 5) { + ubifs_err(c, "on-flash format version 5 is needed for encryption"); + return -EINVAL; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + sup->flags |= cpu_to_le32(UBIFS_FLG_ENCRYPTION); + + err = ubifs_write_sb_node(c, sup); + if (!err) + c->encrypted = 1; + kfree(sup); + + return err; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 4ec051089186..e08aa04fc835 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -198,7 +198,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; - inode->i_link = ui->data; break; case S_IFBLK: case S_IFCHR: @@ -380,6 +379,9 @@ out: } done: clear_inode(inode); +#ifdef CONFIG_UBIFS_FS_ENCRYPTION + fscrypt_put_encryption_info(inode, NULL); +#endif } static void ubifs_dirty_inode(struct inode *inode, int flags) @@ -1207,7 +1209,8 @@ static int mount_ubifs(struct ubifs_info *c) bu_init(c); if (!c->ro_mount) { - c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ + UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) goto out_free; @@ -1620,7 +1623,8 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } - c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ + UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) { err = -ENOMEM; goto out; @@ -1995,6 +1999,12 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) return c; } +#ifndef CONFIG_UBIFS_FS_ENCRYPTION +struct fscrypt_operations ubifs_crypt_operations = { + .is_encrypted = __ubifs_crypt_is_encrypted, +}; +#endif + static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { struct ubifs_info *c = sb->s_fs_info; @@ -2041,6 +2051,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; sb->s_xattr = ubifs_xattr_handlers; + sb->s_cop = &ubifs_crypt_operations; mutex_lock(&c->umount_mutex); err = mount_ubifs(c); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index fa9a20cc60d6..74ae2de949df 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -378,7 +378,7 @@ static void lnc_free(struct ubifs_zbranch *zbr) } /** - * tnc_read_node_nm - read a "hashed" leaf node. + * tnc_read_hashed_node - read a "hashed" leaf node. * @c: UBIFS file-system description object * @zbr: key and position of the node * @node: node is returned here @@ -388,8 +388,8 @@ static void lnc_free(struct ubifs_zbranch *zbr) * added to LNC. Returns zero in case of success or a negative negative error * code in case of failure. */ -static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, - void *node) +static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) { int err; @@ -519,7 +519,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, * of failure, a negative error code is returned. */ static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, - const struct qstr *nm) + const struct fscrypt_name *nm) { struct ubifs_dent_node *dent; int nlen, err; @@ -542,11 +542,11 @@ static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, dent = zbr->leaf; nlen = le16_to_cpu(dent->nlen); - err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm))); if (err == 0) { - if (nlen == nm->len) + if (nlen == fname_len(nm)) return NAME_MATCHES; - else if (nlen < nm->len) + else if (nlen < fname_len(nm)) return NAME_LESS; else return NAME_GREATER; @@ -689,7 +689,7 @@ static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) */ static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n, - const struct qstr *nm) + const struct fscrypt_name *nm) { int err; @@ -807,7 +807,7 @@ static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, */ static int fallible_matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, - const struct qstr *nm) + const struct fscrypt_name *nm) { struct ubifs_dent_node *dent; int nlen, err; @@ -835,11 +835,11 @@ static int fallible_matches_name(struct ubifs_info *c, dent = zbr->leaf; nlen = le16_to_cpu(dent->nlen); - err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm))); if (err == 0) { - if (nlen == nm->len) + if (nlen == fname_len(nm)) return NAME_MATCHES; - else if (nlen < nm->len) + else if (nlen < fname_len(nm)) return NAME_LESS; else return NAME_GREATER; @@ -878,7 +878,8 @@ out_free: static int fallible_resolve_collision(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n, - const struct qstr *nm, int adding) + const struct fscrypt_name *nm, + int adding) { struct ubifs_znode *o_znode = NULL, *znode = *zn; int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; @@ -1453,7 +1454,7 @@ again: * In this case the leaf node cache gets used, so we pass the * address of the zbranch and keep the mutex locked */ - err = tnc_read_node_nm(c, zt, node); + err = tnc_read_hashed_node(c, zt, node); goto out; } if (safely) { @@ -1782,19 +1783,19 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) * @node: the node is returned here * @nm: node name * - * This function look up and reads a node which contains name hash in the key. + * This function looks up and reads a node which contains name hash in the key. * Since the hash may have collisions, there may be many nodes with the same * key, so we have to sequentially look to all of them until the needed one is * found. This function returns zero in case of success, %-ENOENT if the node * was not found, and a negative error code in case of failure. */ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm) + void *node, const struct fscrypt_name *nm) { int found, n, err; struct ubifs_znode *znode; - dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); + //dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1816,7 +1817,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, goto out_unlock; } - err = tnc_read_node_nm(c, &znode->zbranch[n], node); + err = tnc_read_hashed_node(c, &znode->zbranch[n], node); out_unlock: mutex_unlock(&c->tnc_mutex); @@ -1830,14 +1831,14 @@ out_unlock: * @node: the node is returned here * @nm: node name * - * This function look up and reads a node which contains name hash in the key. + * This function looks up and reads a node which contains name hash in the key. * Since the hash may have collisions, there may be many nodes with the same * key, so we have to sequentially look to all of them until the needed one is * found. This function returns zero in case of success, %-ENOENT if the node * was not found, and a negative error code in case of failure. */ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm) + void *node, const struct fscrypt_name *nm) { int err, len; const struct ubifs_dent_node *dent = node; @@ -1851,16 +1852,105 @@ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, return err; len = le16_to_cpu(dent->nlen); - if (nm->len == len && !memcmp(dent->name, nm->name, len)) + if (fname_len(nm) == len && !memcmp(dent->name, fname_name(nm), len)) return 0; /* * Unluckily, there are hash collisions and we have to iterate over * them look at each direntry with colliding name hash sequentially. */ + return do_lookup_nm(c, key, node, nm); } +static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_dent_node *dent, uint32_t cookie) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey, start_key; + + ubifs_assert(is_hash_key(c, key)); + + lowest_dent_key(c, &start_key, key_inum(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, &start_key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + for (;;) { + if (!err) { + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + + zbr = &znode->zbranch[n]; + dkey = &zbr->key; + + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_unlock; + } + + err = tnc_read_hashed_node(c, zbr, dent); + if (err) + goto out_unlock; + + if (key_hash(c, key) == key_hash(c, dkey) && + le32_to_cpu(dent->cookie) == cookie) + goto out_unlock; + } + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_dh - look up a "double hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @cookie: node cookie for collision resolution + * + * This function looks up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one + * with the same cookie value is found. + * This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + void *node, uint32_t cookie) +{ + int err; + const struct ubifs_dent_node *dent = node; + + if (!c->double_hash) + return -EOPNOTSUPP; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. + */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + if (le32_to_cpu(dent->cookie) == cookie) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them look at each direntry with colliding name hash sequentially. + */ + return do_lookup_dh(c, key, node, cookie); +} + /** * correct_parent_keys - correct parent znodes' keys. * @c: UBIFS file-system description object @@ -2279,14 +2369,15 @@ out_unlock: * may have collisions, like directory entry keys. */ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, - int lnum, int offs, int len, const struct qstr *nm) + int lnum, int offs, int len, + const struct fscrypt_name *nm) { int found, n, err = 0; struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", - lnum, offs, nm->len, nm->name); + //dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", + // lnum, offs, nm->len, nm->name); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2344,7 +2435,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, * by passing 'ubifs_tnc_remove_nm()' the same key but * an unmatchable name. */ - struct qstr noname = { .name = "" }; + struct fscrypt_name noname = { .disk_name = { .name = "", .len = 1 } }; err = dbg_check_tnc(c, 0); mutex_unlock(&c->tnc_mutex); @@ -2514,13 +2605,13 @@ out_unlock: * Returns %0 on success or negative error code on failure. */ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, - const struct qstr *nm) + const struct fscrypt_name *nm) { int n, err; struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnck(key, "%.*s, key ", nm->len, nm->name); + //dbg_tnck(key, "%.*s, key ", nm->len, nm->name); err = lookup_level0_dirty(c, key, &znode, &n); if (err < 0) goto out_unlock; @@ -2669,7 +2760,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) { union ubifs_key key1, key2; struct ubifs_dent_node *xent, *pxent = NULL; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; dbg_tnc("ino %lu", (unsigned long)inum); @@ -2694,8 +2785,8 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) dbg_tnc("xent '%s', ino %lu", xent->name, (unsigned long)xattr_inum); - nm.name = xent->name; - nm.len = le16_to_cpu(xent->nlen); + fname_name(&nm) = xent->name; + fname_len(&nm) = le16_to_cpu(xent->nlen); err = ubifs_tnc_remove_nm(c, &key1, &nm); if (err) { kfree(xent); @@ -2747,7 +2838,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) */ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, union ubifs_key *key, - const struct qstr *nm) + const struct fscrypt_name *nm) { int n, err, type = key_type(c, key); struct ubifs_znode *znode; @@ -2755,7 +2846,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, struct ubifs_zbranch *zbr; union ubifs_key *dkey; - dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)"); + //dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)"); ubifs_assert(is_hash_key(c, key)); mutex_lock(&c->tnc_mutex); @@ -2763,7 +2854,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, if (unlikely(err < 0)) goto out_unlock; - if (nm->name) { + if (fname_len(nm) > 0) { if (err) { /* Handle collisions */ err = resolve_collision(c, key, &znode, &n, nm); @@ -2813,7 +2904,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, goto out_free; } - err = tnc_read_node_nm(c, zbr, dent); + err = tnc_read_hashed_node(c, zbr, dent); if (unlikely(err)) goto out_free; diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index e24380cf46ed..e8c23c9d4f4a 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -46,7 +46,7 @@ * UBIFS went into mainline kernel with format version 4. The older formats * were development formats. */ -#define UBIFS_FORMAT_VERSION 4 +#define UBIFS_FORMAT_VERSION 5 /* * Read-only compatibility version. If the UBIFS format is changed, older UBIFS @@ -301,6 +301,13 @@ enum { #define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ /* + * xattr name of UBIFS encryption context, we don't use a prefix + * nor a long name to not waste space on the flash. + */ +#define UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT "c" + + +/* * On-flash inode flags. * * UBIFS_COMPR_FL: use compression for this inode @@ -309,6 +316,7 @@ enum { * UBIFS_APPEND_FL: writes to the inode may only append data * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * UBIFS_CRYPT_FL: use encryption for this inode * * Note, these are on-flash flags which correspond to ioctl flags * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not @@ -321,6 +329,7 @@ enum { UBIFS_APPEND_FL = 0x08, UBIFS_DIRSYNC_FL = 0x10, UBIFS_XATTR_FL = 0x20, + UBIFS_CRYPT_FL = 0x40, }; /* Inode flag bits used by UBIFS */ @@ -409,12 +418,19 @@ enum { * * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed + * UBIFS_FLG_DOUBLE_HASH: store a 32bit cookie in directory entry nodes to + * support 64bit cookies for lookups by hash + * UBIFS_FLG_ENCRYPTION: this filesystem contains encrypted files */ enum { UBIFS_FLG_BIGLPT = 0x02, UBIFS_FLG_SPACE_FIXUP = 0x04, + UBIFS_FLG_DOUBLE_HASH = 0x08, + UBIFS_FLG_ENCRYPTION = 0x10, }; +#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT|UBIFS_FLG_SPACE_FIXUP|UBIFS_FLG_DOUBLE_HASH|UBIFS_FLG_ENCRYPTION) + /** * struct ubifs_ch - common header node. * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) @@ -521,7 +537,8 @@ struct ubifs_ino_node { * @padding1: reserved for future, zeroes * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) * @nlen: name length - * @padding2: reserved for future, zeroes + * @cookie: A 32bits random number, used to construct a 64bits + * identifier. * @name: zero-terminated name * * Note, do not forget to amend 'zero_dent_node_unused()' function when @@ -534,7 +551,7 @@ struct ubifs_dent_node { __u8 padding1; __u8 type; __le16 nlen; - __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ + __le32 cookie; __u8 name[]; } __packed; @@ -544,18 +561,16 @@ struct ubifs_dent_node { * @key: node key * @size: uncompressed data size in bytes * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) - * @padding: reserved for future, zeroes + * @compr_size: compressed data size in bytes, only valid when data is encrypted * @data: data * - * Note, do not forget to amend 'zero_data_node_unused()' function when - * changing the padding fields. */ struct ubifs_data_node { struct ubifs_ch ch; __u8 key[UBIFS_MAX_KEY_LEN]; __le32 size; __le16 compr_type; - __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __le16 compr_size; __u8 data[]; } __packed; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 096035eb29d0..ca72382ce6cc 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -38,6 +38,8 @@ #include <linux/backing-dev.h> #include <linux/security.h> #include <linux/xattr.h> +#include <linux/fscrypto.h> +#include <linux/random.h> #include "ubifs-media.h" /* Version of this UBIFS implementation */ @@ -83,10 +85,6 @@ */ #define BGT_NAME_PATTERN "ubifs_bgt%d_%d" -/* Write-buffer synchronization timeout interval in seconds */ -#define WBUF_TIMEOUT_SOFTLIMIT 3 -#define WBUF_TIMEOUT_HARDLIMIT 5 - /* Maximum possible inode number (only 32-bit inodes are supported now) */ #define MAX_INUM 0xFFFFFFFF @@ -138,6 +136,12 @@ */ #define WORST_COMPR_FACTOR 2 +#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE +#else +#define UBIFS_CIPHER_BLOCK_SIZE 0 +#endif + /* * How much memory is needed for a buffer where we compress a data node. */ @@ -645,9 +649,6 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @io_mutex: serializes write-buffer I/O * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes * fields - * @softlimit: soft write-buffer timeout interval - * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit - * and @softlimit + @delta) * @timer: write-buffer timer * @no_timer: non-zero if this write-buffer does not have a timer * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing @@ -676,8 +677,6 @@ struct ubifs_wbuf { int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; spinlock_t lock; - ktime_t softlimit; - unsigned long long delta; struct hrtimer timer; unsigned int no_timer:1; unsigned int need_sync:1; @@ -1007,6 +1006,8 @@ struct ubifs_debug_info; * * @big_lpt: flag that LPT is too big to write whole during commit * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up + * @double_hash: flag indicating that we can do lookups by hash + * @encrypted: flag indicating that this file system contains encrypted files * @no_chk_data_crc: do not check CRCs when reading data nodes (except during * recovery) * @bulk_read: enable bulk-reads @@ -1249,6 +1250,8 @@ struct ubifs_info { unsigned int big_lpt:1; unsigned int space_fixup:1; + unsigned int double_hash:1; + unsigned int encrypted:1; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; @@ -1515,25 +1518,29 @@ int ubifs_consolidate_log(struct ubifs_info *c); /* journal.c */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, - const struct qstr *nm, const struct inode *inode, + const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, - const struct dentry *fst_dentry, + const struct inode *fst_inode, + const struct fscrypt_name *fst_nm, const struct inode *snd_dir, - const struct dentry *snd_dentry, int sync); + const struct inode *snd_inode, + const struct fscrypt_name *snd_nm, int sync); int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, - const struct dentry *old_dentry, + const struct inode *old_inode, + const struct fscrypt_name *old_nm, const struct inode *new_dir, - const struct dentry *new_dentry, + const struct inode *new_inode, + const struct fscrypt_name *new_nm, const struct inode *whiteout, int sync); int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, loff_t old_size, loff_t new_size); int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, - const struct inode *inode, const struct qstr *nm); + const struct inode *inode, const struct fscrypt_name *nm); int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, const struct inode *inode2); @@ -1568,7 +1575,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n); int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm); + void *node, const struct fscrypt_name *nm); +int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + void *node, uint32_t secondary_hash); int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, void *node, int *lnum, int *offs); int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, @@ -1576,16 +1585,16 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, int old_lnum, int old_offs, int lnum, int offs, int len); int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, - int lnum, int offs, int len, const struct qstr *nm); + int lnum, int offs, int len, const struct fscrypt_name *nm); int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, - const struct qstr *nm); + const struct fscrypt_name *nm); int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, union ubifs_key *to_key); int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, union ubifs_key *key, - const struct qstr *nm); + const struct fscrypt_name *nm); void ubifs_tnc_close(struct ubifs_info *c); int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, int lnum, int offs, int is_idx); @@ -1642,6 +1651,7 @@ int ubifs_read_superblock(struct ubifs_info *c); struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); int ubifs_fixup_free_space(struct ubifs_info *c); +int ubifs_enable_encryption(struct ubifs_info *c); /* replay.c */ int ubifs_validate_entry(struct ubifs_info *c, @@ -1733,16 +1743,21 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, int flags); #endif /* dir.c */ -struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, +struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode); int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ extern const struct xattr_handler *ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); int ubifs_init_security(struct inode *dentry, struct inode *inode, const struct qstr *qstr); +int ubifs_xattr_set(struct inode *host, const char *name, const void *value, + size_t size, int flags); +ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, + size_t size); /* super.c */ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); @@ -1781,6 +1796,66 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, #include "misc.h" #include "key.h" +#ifndef CONFIG_UBIFS_FS_ENCRYPTION +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk +static inline int ubifs_encrypt(const struct inode *inode, + struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, + int block) +{ + ubifs_assert(0); + return -EOPNOTSUPP; +} +static inline int ubifs_decrypt(const struct inode *inode, + struct ubifs_data_node *dn, + unsigned int *out_len, int block) +{ + ubifs_assert(0); + return -EOPNOTSUPP; +} +#else +/* crypto.c */ +int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, int block); +int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int *out_len, int block); +#endif + +extern struct fscrypt_operations ubifs_crypt_operations; + +static inline bool __ubifs_crypt_is_encrypted(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + return ui->flags & UBIFS_CRYPT_FL; +} + +static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) +{ + return __ubifs_crypt_is_encrypted((struct inode *)inode); +} + /* Normal UBIFS messages */ __printf(2, 3) void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index d9f9615bfd71..efe00fcb8b75 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -97,7 +97,7 @@ static const struct file_operations empty_fops; * of failure. */ static int create_xattr(struct ubifs_info *c, struct inode *host, - const struct qstr *nm, const void *value, int size) + const struct fscrypt_name *nm, const void *value, int size) { int err, names_len; struct inode *inode; @@ -117,7 +117,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, * extended attributes if the name list becomes larger. This limitation * is artificial for UBIFS, though. */ - names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1; + names_len = host_ui->xattr_names + host_ui->xattr_cnt + fname_len(nm) + 1; if (names_len > XATTR_LIST_MAX) { ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", host->i_ino, names_len, XATTR_LIST_MAX); @@ -154,9 +154,18 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt += 1; - host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); - host_ui->xattr_names += nm->len; + host_ui->xattr_names += fname_len(nm); + + /* + * We handle UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT here because we + * have to set the UBIFS_CRYPT_FL flag on the host inode. + * To avoid multiple updates of the same inode in the same operation, + * let's do it here. + */ + if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) + host_ui->flags |= UBIFS_CRYPT_FL; err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) @@ -170,9 +179,10 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_cnt -= 1; - host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(size); - host_ui->xattr_names -= nm->len; + host_ui->xattr_names -= fname_len(nm); + host_ui->flags &= ~UBIFS_CRYPT_FL; mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -269,22 +279,28 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) return ERR_PTR(-EINVAL); } -static int __ubifs_setxattr(struct inode *host, const char *name, - const void *value, size_t size, int flags) +int ubifs_xattr_set(struct inode *host, const char *name, const void *value, + size_t size, int flags) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_dent_node *xent; union ubifs_key key; int err; - ubifs_assert(inode_is_locked(host)); + /* + * Creating an encryption context is done unlocked since we + * operate on a new inode which is not visible to other users + * at this point. + */ + if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) != 0) + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -329,18 +345,18 @@ out_free: return err; } -static ssize_t __ubifs_getxattr(struct inode *host, const char *name, - void *buf, size_t size) +ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, + size_t size) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_inode *ui; struct ubifs_dent_node *xent; union ubifs_key key; int err; - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -387,6 +403,20 @@ out_unlock: return err; } +static bool xattr_visible(const char *name) +{ + /* File encryption related xattrs are for internal use only */ + if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) + return false; + + /* Show trusted namespace only for "power" users */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) == 0 && !capable(CAP_SYS_ADMIN)) + return false; + + return true; +} + ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) { union ubifs_key key; @@ -395,7 +425,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_dent_node *xent, *pxent = NULL; int err, len, written = 0; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino, dentry, size); @@ -419,15 +449,12 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) break; } - nm.name = xent->name; - nm.len = le16_to_cpu(xent->nlen); + fname_name(&nm) = xent->name; + fname_len(&nm) = le16_to_cpu(xent->nlen); - /* Show trusted namespace only for "power" users */ - if (strncmp(xent->name, XATTR_TRUSTED_PREFIX, - XATTR_TRUSTED_PREFIX_LEN) || - capable(CAP_SYS_ADMIN)) { - memcpy(buffer + written, nm.name, nm.len + 1); - written += nm.len + 1; + if (xattr_visible(xent->name)) { + memcpy(buffer + written, fname_name(&nm), fname_len(&nm) + 1); + written += fname_len(&nm) + 1; } kfree(pxent); @@ -446,7 +473,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) } static int remove_xattr(struct ubifs_info *c, struct inode *host, - struct inode *inode, const struct qstr *nm) + struct inode *inode, const struct fscrypt_name *nm) { int err; struct ubifs_inode *host_ui = ubifs_inode(host); @@ -463,9 +490,9 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt -= 1; - host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_names -= nm->len; + host_ui->xattr_names -= fname_len(nm); err = ubifs_jnl_delete_xattr(c, host, inode, nm); if (err) @@ -477,27 +504,27 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_cnt += 1; - host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_names += nm->len; + host_ui->xattr_names += fname_len(nm); mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); return err; } -static int __ubifs_removexattr(struct inode *host, const char *name) +static int ubifs_xattr_remove(struct inode *host, const char *name) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_dent_node *xent; union ubifs_key key; int err; ubifs_assert(inode_is_locked(host)); - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -548,7 +575,8 @@ static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); - err = __ubifs_setxattr(inode, name, xattr->value, xattr->value_len, 0); + err = ubifs_xattr_set(inode, name, xattr->value, + xattr->value_len, 0); kfree(name); if (err < 0) break; @@ -572,7 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, return err; } -static int ubifs_xattr_get(const struct xattr_handler *handler, +static int xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { @@ -580,10 +608,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, inode->i_ino, dentry, size); name = xattr_full_name(handler, name); - return __ubifs_getxattr(inode, name, buffer, size); + return ubifs_xattr_get(inode, name, buffer, size); } -static int ubifs_xattr_set(const struct xattr_handler *handler, +static int xattr_set(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -594,27 +622,27 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, name = xattr_full_name(handler, name); if (value) - return __ubifs_setxattr(inode, name, value, size, flags); + return ubifs_xattr_set(inode, name, value, size, flags); else - return __ubifs_removexattr(inode, name); + return ubifs_xattr_remove(inode, name); } static const struct xattr_handler ubifs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; static const struct xattr_handler ubifs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; static const struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; const struct xattr_handler *ubifs_xattr_handlers[] = { diff --git a/fs/udf/dir.c b/fs/udf/dir.c index aaec13c95253..2d0e028067eb 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -30,6 +30,7 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 988d5352bdb8..7aa48bd7cbaf 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -16,6 +16,7 @@ #include <linux/fs.h> #include <linux/string.h> +#include <linux/bio.h> struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh *fibh, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index aad46401ede5..0f3db71753aa 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include <linux/crc-itu-t.h> #include <linux/mpage.h> #include <linux/uio.h> +#include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 67e085d591d8..a0376a2c1c29 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/bitops.h> +#include <linux/bio.h> #include <asm/byteorder.h> #include "ufs_fs.h" @@ -306,8 +307,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, (unsigned long long)(pos + newb), pos); bh->b_blocknr = newb + pos; - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); mark_buffer_dirty(bh); ++j; bh = bh->b_this_page; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 190d64be22ed..45ceb94e89e4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1070,8 +1070,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * we do not zeroize fragment, because of * if it maped to hole, it already contains zeroes diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..d96e2f30084b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -257,9 +257,9 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct fault_env *fe, unsigned long reason) +int handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = fe->vma->vm_mm; + struct mm_struct *mm = vmf->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; @@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = fe->vma->vm_userfaultfd_ctx.ctx; + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * and wait. */ ret = VM_FAULT_RETRY; - if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(fe->address, fe->flags, reason); + uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); uwq.ctx = ctx; return_to_userland = - (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); spin_lock(&ctx->fault_pending_wqh.lock); @@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index effb64cf714f..5050056a0b06 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2455,12 +2455,15 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return false; /* @@ -2477,7 +2480,8 @@ xfs_agf_verify( return false; if (xfs_sb_version_hasreflink(&mp->m_sb) && - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return false; return true;; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 5ba2dac5e67c..efb467b10a71 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -421,13 +421,17 @@ xfs_allocbt_init_cursor( ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_BNO) + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + else + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 8ea91f363093..2852521fc8ec 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -253,6 +253,7 @@ xfs_attr3_leaf_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_perag *pag = bp->b_pag; struct xfs_attr3_icleaf_hdr ichdr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -273,7 +274,12 @@ xfs_attr3_leaf_verify( if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) return false; } - if (ichdr.count == 0) + /* + * In recovery there is a transient state where count == 0 is valid + * because we may have transitioned an empty shortform attr to a leaf + * if the attr didn't fit in shortform. + */ + if (pag && pag->pagf_init && ichdr.count == 0) return false; /* XXX: need to range check rest of attr header values */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4f2aed04f827..f7dda0c237b0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,7 +51,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); -int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); +int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* @@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr3_leaf_list_int(struct xfs_buf *bp, +void xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c6eb21940783..2760bc3b2536 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,6 +49,8 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" +#include "xfs_rmap_btree.h" +#include "xfs_icache.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -190,8 +192,12 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ + xfs_filblks_t orig_len; mp = ip->i_mount; + + /* Calculate the worst-case size of the bmbt. */ + orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -199,12 +205,20 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) { + rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; + break; + } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } + + /* Calculate the worst-case size of the rmapbt. */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + + mp->m_rmap_maxlevels; + return rval; } @@ -504,7 +518,7 @@ void xfs_bmap_trace_exlist( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ + int whichfork, /* data or attr or cow fork */ unsigned long caller_ip) { xfs_extnum_t idx; /* extent record index */ @@ -513,11 +527,13 @@ xfs_bmap_trace_exlist( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(cnt == xfs_iext_count(ifp)); for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); + trace_xfs_extlist(ip, idx, state, caller_ip); } /* @@ -811,7 +827,7 @@ try_another_ag: XFS_BTREE_LONG_PTRS); arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { @@ -1137,6 +1153,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; + if (ip->i_d.di_anextents != 0) { + error = -EFSCORRUPTED; + goto trans_cancel; + } if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1144,7 +1164,6 @@ xfs_bmap_add_attrfork( ASSERT(ip->i_d.di_aformat == 0); ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } - ASSERT(ip->i_d.di_anextents == 0); xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1296,7 +1315,7 @@ xfs_bmap_read_extents( /* * Here with bp and block set to the leftmost leaf node in the tree. */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + room = xfs_iext_count(ifp); i = 0; /* * Loop over all leaf nodes. Copy information to the extent records. @@ -1361,8 +1380,9 @@ xfs_bmap_read_extents( return error; block = XFS_BUF_TO_BLOCK(bp); } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) + return -EFSCORRUPTED; + ASSERT(i == xfs_iext_count(ifp)); XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: @@ -1370,97 +1390,6 @@ error0: return -EFSCORRUPTED; } - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. - */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(ip->i_mount, xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - /* * Returns the file-relative block number of the first unused block(s) * in the file with at least "len" logically contiguous blocks free. @@ -1497,7 +1426,7 @@ xfs_bmap_first_unused( (error = xfs_iread_extents(tp, ip, whichfork))) return error; lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); off = xfs_bmbt_get_startoff(ep); @@ -1523,44 +1452,44 @@ xfs_bmap_first_unused( */ int /* error */ xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ { - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + xfs_extnum_t idx; + int error; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return -EIO; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_EXTENTS: + break; + default: + return -EIO; } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; } - /* - * Otherwise *last_block is already the right answer. - */ + + if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) { + if (got.br_startoff <= *last_block - 1) + return 0; + } + + if (xfs_iext_get_extent(ifp, idx - 1, &got)) { + *last_block = got.br_startoff + got.br_blockcount; + return 0; + } + + *last_block = 0; return 0; } @@ -1582,7 +1511,7 @@ xfs_bmap_last_extent( return error; } - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *is_empty = 1; return 0; @@ -1735,7 +1664,7 @@ xfs_bmap_add_extent_delay_real( &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -1794,7 +1723,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2300,7 +2229,7 @@ xfs_bmap_add_extent_unwritten_real( ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2356,7 +2285,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (*idx < xfs_iext_count(&ip->i_df) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2836,7 +2765,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2966,7 +2895,7 @@ xfs_bmap_add_extent_hole_real( ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -2992,7 +2921,7 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) @@ -4145,12 +4074,11 @@ xfs_bmapi_read( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_fileoff_t obno; xfs_fileoff_t end; - xfs_extnum_t lastx; + xfs_extnum_t idx; int error; - int eof; + bool eof = false; int n = 0; int whichfork = xfs_bmapi_whichfork(flags); @@ -4190,7 +4118,8 @@ xfs_bmapi_read( return error; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) + eof = true; end = bno + len; obno = bno; @@ -4221,10 +4150,8 @@ xfs_bmapi_read( break; /* Else go on to the next record. */ - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); - else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++idx, &got)) + eof = true; } *nmap = n; return 0; @@ -4234,10 +4161,10 @@ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, + xfs_fileoff_t off, xfs_filblks_t len, + xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof) { @@ -4248,10 +4175,17 @@ xfs_bmapi_reserve_delalloc( char rt = XFS_IS_REALTIME_INODE(ip); xfs_extlen_t extsz; int error; + xfs_fileoff_t aoff = off; - alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. + */ + alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; /* Figure out the extent size, adjust alen */ if (whichfork == XFS_COW_FORK) @@ -4259,7 +4193,12 @@ xfs_bmapi_reserve_delalloc( else extsz = xfs_get_extsz_hint(ip); if (extsz) { - error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + struct xfs_bmbt_irec prev; + + if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, 1, 0, &aoff, &alen); ASSERT(!error); } @@ -4312,6 +4251,16 @@ xfs_bmapi_reserve_delalloc( */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. + */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + ASSERT(got->br_startoff <= aoff); ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); ASSERT(isnullstartblock(got->br_startblock)); @@ -4349,7 +4298,7 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (bma->idx != NULLEXTNUM && bma->idx) { + if (bma->idx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &bma->prev); } @@ -4563,7 +4512,7 @@ xfs_bmapi_write( struct xfs_ifork *ifp; struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* after the end of extents */ + bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ @@ -4641,12 +4590,14 @@ xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, - &bma.prev); n = 0; end = bno + len; obno = bno; + if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got)) + eof = true; + if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; bma.tp = tp; bma.ip = ip; bma.total = total; @@ -4733,11 +4684,8 @@ xfs_bmapi_write( /* Else go on to the next record. */ bma.prev = bma.got; - if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), - &bma.got); - } else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got)) + eof = true; } *nmap = n; @@ -4885,7 +4833,7 @@ xfs_bmap_del_extent_delay( da_new = 0; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -4902,8 +4850,11 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + error = xfs_trans_reserve_quota_nblks(NULL, ip, + -((long)del->br_blockcount), 0, isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; ip->i_delayed_blks -= del->br_blockcount; if (whichfork == XFS_COW_FORK) @@ -5013,7 +4964,7 @@ xfs_bmap_del_extent_cow( got_endoff = got->br_startoff + got->br_blockcount; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -5119,8 +5070,7 @@ xfs_bmap_del_extent( state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp))); ASSERT(del->br_blockcount > 0); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &got); @@ -5434,8 +5384,6 @@ __xfs_bunmapi( { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ - int eof; /* is deleting at eof */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ xfs_extnum_t extno; /* extent number in list */ xfs_bmbt_irec_t got; /* current extent record */ @@ -5445,8 +5393,6 @@ __xfs_bunmapi( int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t prev; /* previous extent record */ xfs_fileoff_t start; /* first file offset deleted */ int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ @@ -5477,8 +5423,7 @@ __xfs_bunmapi( if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { + if (xfs_iext_count(ifp) == 0) { *rlen = 0; return 0; } @@ -5486,18 +5431,17 @@ __xfs_bunmapi( isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); /* * Check to see if the given block number is past the end of the * file, back up to the last block if so... */ - if (eof) { - ep = xfs_iext_get_ext(ifp, --lastx); - xfs_bmbt_get_all(ep, &got); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) { + ASSERT(lastx > 0); + xfs_iext_get_extent(ifp, --lastx, &got); bno = got.br_startoff + got.br_blockcount - 1; } + logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); @@ -5528,8 +5472,7 @@ __xfs_bunmapi( if (got.br_startoff > bno) { if (--lastx < 0) break; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); } /* * Is the last block of this extent before the range @@ -5543,7 +5486,6 @@ __xfs_bunmapi( * Then deal with the (possibly delayed) allocated space * we found. */ - ASSERT(ep != NULL); del = got; wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { @@ -5624,15 +5566,12 @@ __xfs_bunmapi( */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (got.br_startoff > bno) { - if (--lastx >= 0) { - ep = xfs_iext_get_ext(ifp, - lastx); - xfs_bmbt_get_all(ep, &got); - } - } + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec prev; + /* * This one is already unwritten. * It must have a written left neighbor. @@ -5640,8 +5579,7 @@ __xfs_bunmapi( * try again. */ ASSERT(lastx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, - lastx - 1), &prev); + xfs_iext_get_extent(ifp, lastx - 1, &prev); ASSERT(prev.br_state == XFS_EXT_NORM); ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == @@ -5739,13 +5677,9 @@ nodelete: */ if (bno != (xfs_fileoff_t)-1 && bno >= start) { if (lastx >= 0) { - ep = xfs_iext_get_ext(ifp, lastx); - if (xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, - lastx); - } - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); } extno++; } @@ -5963,7 +5897,7 @@ xfs_bmse_shift_one( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); xfs_bmbt_get_all(gotp, &got); @@ -6140,7 +6074,7 @@ xfs_bmap_shift_extents( * are collapsing out, so we cannot use the count of real extents here. * Instead we have to calculate it from the incore fork. */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); if (total_extents == 0) { *done = 1; goto del_cursor; @@ -6200,7 +6134,7 @@ xfs_bmap_shift_extents( * count can change. Update the total and grade the next record. */ if (direction == SHIFT_LEFT) { - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); stop_extent = total_extents; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 7cae6ec27fa6..cecd094404cc 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -237,14 +237,9 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); -struct xfs_bmbt_rec_host * - xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, - int fork, int *eofp, xfs_extnum_t *lastxp, - struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, xfs_filblks_t len, - struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, - xfs_extnum_t *lastx, int eof); + xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8007d2ba9aef..d6330c297ca0 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -796,13 +796,14 @@ xfs_bmbt_init_cursor( struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_btnum = XFS_BTNUM_BMAP; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0e80993c8a59..21e6a6ab6b9a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block( if (error) return error; + /* Check the inode owner since the verifiers don't. */ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) + goto out_bad; + + /* Did we get the level we were looking for? */ + if (be16_to_cpu((*blkp)->bb_level) != level) + goto out_bad; + + /* Check that internal nodes have at least one record. */ + if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) + goto out_bad; + xfs_btree_setbuf(cur, level, bp); return 0; + +out_bad: + *blkp = NULL; + xfs_trans_brelse(cur->bc_tp, bp); + return -EFSCORRUPTED; } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee..b69b947c4c1b 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -96,46 +96,10 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(mp, type, stat) \ - XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) #define XFS_BTREE_STATS_INC(cur, stat) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ - case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ - case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) - -#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ - XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) -#define XFS_BTREE_STATS_ADD(cur, stat, val) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: \ - __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ - case XFS_BTNUM_CNT: \ - __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: \ - __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ - case XFS_BTNUM_INO: \ - __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ - case XFS_BTNUM_FINO: \ - __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ - case XFS_BTNUM_RMAP: \ - __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ - case XFS_BTNUM_REFC: \ - __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) + XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ + XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) #define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ @@ -253,6 +217,7 @@ typedef struct xfs_btree_cur __uint8_t bc_nlevels; /* number of levels in the tree */ __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ + int bc_statoff; /* offset of btre stats array */ union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index fad1676ad8cd..a416c7cb23ea 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -6,10 +6,11 @@ /* * Calculate the intermediate checksum for a buffer that has the CRC field * inside it. The offset of the 32bit crc fields is passed as the - * cksum_offset parameter. + * cksum_offset parameter. We do not modify the buffer during verification, + * hence we have to split the CRC calculation across the cksum_offset. */ static inline __uint32_t -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { __uint32_t zero = 0; __uint32_t crc; @@ -26,6 +27,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) } /* + * Fast CRC method where the buffer is modified. Callers must have exclusive + * access to the buffer while the calculation takes place. + */ +static inline __uint32_t +xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) +{ + /* zero the CRC field */ + *(__le32 *)(buffer + cksum_offset) = 0; + + /* single pass CRC calculation for the entire buffer */ + return crc32c(XFS_CRC_SEED, buffer, length); +} + +/* * Convert the intermediate checksum to the final ondisk format. * * The CRC32c calculation uses LE format even on BE machines, but returns the @@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc) /* * Helper to generate the checksum for a buffer. + * + * This modifies the buffer temporarily - callers must have exclusive + * access to the buffer while the calculation takes place. */ static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 20a96dd5af7e..c58d72c220f5 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -93,7 +93,7 @@ xfs_ascii_ci_compname( return result; } -static struct xfs_nameops xfs_ascii_ci_nameops = { +static const struct xfs_nameops xfs_ascii_ci_nameops = { .hashname = xfs_ascii_ci_hashname, .compname = xfs_ascii_ci_compname, }; diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index becc926c3e3d..0197590fa7d7 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -157,6 +157,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); +extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, + struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, @@ -177,6 +180,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup); +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 725fc7841fde..d478065b9544 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -329,7 +329,7 @@ xfs_dir3_data_read( err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } @@ -505,8 +505,9 @@ xfs_dir2_data_freeremove( * Given a data block, reconstruct its bestfree map. */ void -xfs_dir2_data_freescan( - struct xfs_inode *dp, +xfs_dir2_data_freescan_int( + struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, struct xfs_dir2_data_hdr *hdr, int *loghead) { @@ -516,7 +517,6 @@ xfs_dir2_data_freescan( struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ - struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -526,13 +526,13 @@ xfs_dir2_data_freescan( /* * Start by clearing the table. */ - bf = dp->d_ops->data_bestfree_p(hdr); + bf = ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)dp->d_ops->data_entry_p(hdr); + p = (char *)ops->data_entry_p(hdr); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(geo, hdr); @@ -559,12 +559,22 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); - p += dp->d_ops->data_entsize(dep->namelen); + be16_to_cpu(*ops->data_entry_tag_p(dep))); + p += ops->data_entsize(dep->namelen); } } } +void +xfs_dir2_data_freescan( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) +{ + return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops, + hdr, loghead); +} + /* * Initialize a data block at the given block number in the directory. * Give back the buffer for the created block. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index ef9f6ead96a4..d04547fcf274 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -21,7 +21,6 @@ struct dir_context; /* xfs_dir2.c */ -extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 51b4e0de1fdc..f272abff11e1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2344,7 +2344,8 @@ xfs_imap( imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << + mp->m_sb.sb_inodelog); return 0; } @@ -2372,7 +2373,7 @@ out_map: imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* * If the inode number maps to a block outside the bounds @@ -2450,8 +2451,6 @@ xfs_ialloc_log_agi( ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through @@ -2512,8 +2511,15 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; - if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + if (be32_to_cpu(agi->agi_level) < 1 || + be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + return false; + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + (be32_to_cpu(agi->agi_free_level) < 1 || + be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't @@ -2592,6 +2598,8 @@ xfs_read_agi( XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; + if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index eab68ae2e011..0fd086d03d41 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -357,7 +357,7 @@ xfs_inobt_init_cursor( struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; @@ -365,9 +365,11 @@ xfs_inobt_init_cursor( if (btnum == XFS_BTNUM_INO) { cur->bc_nlevels = be32_to_cpu(agi->agi_level); cur->bc_ops = &xfs_inobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); } else { cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); cur->bc_ops = &xfs_finobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); } cur->bc_blocklog = mp->m_sb.sb_blocklog; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 134424fac434..dd483e2767f7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -383,7 +383,7 @@ xfs_log_dinode_to_disk( static bool xfs_dinode_verify( struct xfs_mount *mp, - struct xfs_inode *ip, + xfs_ino_t ino, struct xfs_dinode *dip) { uint16_t flags; @@ -392,6 +392,14 @@ xfs_dinode_verify( if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; + /* don't allow invalid i_size */ + if (be64_to_cpu(dip->di_size) & (1ULL << 63)) + return false; + + /* No zero-length symlinks. */ + if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0) + return false; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return true; @@ -401,7 +409,7 @@ xfs_dinode_verify( if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) return false; - if (be64_to_cpu(dip->di_ino) != ip->i_ino) + if (be64_to_cpu(dip->di_ino) != ino) return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; @@ -436,7 +444,7 @@ xfs_dinode_calc_crc( return; ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); - crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } @@ -493,7 +501,7 @@ xfs_iread( return error; /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip, dip)) { + if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { xfs_alert(mp, "%s: validation failed for inode %lld failed", __func__, ip->i_ino); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 3cfe12a4f58a..6848a0afbce7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -58,8 +58,8 @@ struct xfs_icdinode { */ struct xfs_imap { xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - ushort im_len; /* length in BBs of inode chunk */ - ushort im_boffset; /* inode offset in block in bytes */ + unsigned short im_len; /* length in BBs of inode chunk */ + unsigned short im_boffset; /* inode offset in block in bytes */ }; int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5dd56d3dbb3a..222e103356c6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -775,6 +775,13 @@ xfs_idestroy_fork( } } +/* Count number of incore extents based on if_bytes */ +xfs_extnum_t +xfs_iext_count(struct xfs_ifork *ifp) +{ + return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); +} + /* * Convert in-core extents to on-disk form * @@ -803,7 +810,7 @@ xfs_iextents_copy( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nrecs = xfs_iext_count(ifp); XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); ASSERT(nrecs > 0); @@ -941,7 +948,7 @@ xfs_iext_get_ext( xfs_extnum_t idx) /* index of target extent */ { ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(idx < xfs_iext_count(ifp)); if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { return ifp->if_u1.if_ext_irec->er_extbuf; @@ -1017,7 +1024,7 @@ xfs_iext_add( int new_size; /* size of extents after adding */ xfs_extnum_t nextents; /* number of extents in file */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT((idx >= 0) && (idx <= nextents)); byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); new_size = ifp->if_bytes + byte_diff; @@ -1241,7 +1248,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, idx, state, _RET_IP_); ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); if (new_size == 0) { @@ -1270,7 +1277,7 @@ xfs_iext_remove_inline( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(((nextents - ext_diff) > 0) && (nextents - ext_diff) < XFS_INLINE_EXTS); @@ -1309,7 +1316,7 @@ xfs_iext_remove_direct( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); new_size = ifp->if_bytes - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (new_size == 0) { xfs_iext_destroy(ifp); @@ -1546,7 +1553,7 @@ xfs_iext_indirect_to_direct( int size; /* size of file extents */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); @@ -1620,7 +1627,7 @@ xfs_iext_bno_to_ext( xfs_extnum_t nextents; /* number of file extents */ xfs_fileoff_t startoff = 0; /* start offset of extent */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *idxp = 0; return NULL; @@ -1733,8 +1740,8 @@ xfs_iext_idx_to_irec( ASSERT(ifp->if_flags & XFS_IFEXTIREC); ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + ASSERT(page_idx <= xfs_iext_count(ifp)); + ASSERT(page_idx < xfs_iext_count(ifp) || realloc); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; erp_idx = 0; @@ -1782,7 +1789,7 @@ xfs_iext_irec_init( xfs_extnum_t nextents; /* number of extents in file */ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); @@ -1906,7 +1913,7 @@ xfs_iext_irec_compact( ASSERT(ifp->if_flags & XFS_IFEXTIREC); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { xfs_iext_destroy(ifp); @@ -1996,3 +2003,49 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } + +/* + * Lookup the extent covering bno. + * + * If there is an extent covering bno return the extent index, and store the + * expanded extent structure in *gotp, and the extent index in *idx. + * If there is no extent covering bno, but there is an extent after it (e.g. + * it lies in a hole) return that extent in *gotp and its index in *idx + * instead. + * If bno is beyond the last extent return false, and return the index after + * the last valid index in *idxp. + */ +bool +xfs_iext_lookup_extent( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t bno, + xfs_extnum_t *idxp, + struct xfs_bmbt_irec *gotp) +{ + struct xfs_bmbt_rec_host *ep; + + XFS_STATS_INC(ip->i_mount, xs_look_exlist); + + ep = xfs_iext_bno_to_ext(ifp, bno, idxp); + if (!ep) + return false; + xfs_bmbt_get_all(ep, gotp); + return true; +} + +/* + * Return true if there is an extent at index idx, and return the expanded + * extent structure at idx in that case. Else return false. + */ +bool +xfs_iext_get_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + if (idx < 0 || idx >= xfs_iext_count(ifp)) + return false; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); + return true; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index c9476f50e32d..7fb8365326d1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -152,6 +152,7 @@ void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +xfs_extnum_t xfs_iext_count(struct xfs_ifork *); void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, struct xfs_bmbt_irec *, int); void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); @@ -181,6 +182,12 @@ void xfs_iext_irec_compact_pages(struct xfs_ifork *); void xfs_iext_irec_compact_full(struct xfs_ifork *); void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); +bool xfs_iext_lookup_extent(struct xfs_inode *ip, + struct xfs_ifork *ifp, xfs_fileoff_t bno, + xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); +bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); + extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 083cdd6d6c28..7ae571f8e34a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ + unsigned short blf_flags; /* misc state */ + unsigned short blf_len; /* number of blocks in this buf */ __int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 8e385f91d660..d9f65e2d5cc8 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -52,7 +52,7 @@ typedef struct xlog_recover { struct list_head r_itemq; /* q for items */ } xlog_recover_t; -#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) +#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) /* * This is the number of entries in the l_buf_cancel_table used during diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 453bb2757ec2..6fb2215f8ff7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor( cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_refcountbt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 83e672ff7577..de25771764ba 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_rmapbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e2e1106c9fad..ea45584a9913 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,4 +1016,3 @@ xfs_rtfree_extent( } return 0; } - diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a70aec910626..2580262e4ea0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -262,6 +262,12 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + if (xfs_sb_version_hascrc(&mp->m_sb) && + sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { + xfs_notice(mp, "v5 SB sanity check failed"); + return -EFSCORRUPTED; + } + /* * Until this is fixed only page-sized or smaller data blocks work. */ @@ -338,13 +344,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); - if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + if (sbp->sb_qflags & XFS_PQUOTA_ACCT && + sbp->sb_gquotino != NULLFSINO) { /* * In older version of superblock, on-disk superblock only * has sb_gquotino, and in-core superblock has both sb_gquotino * and sb_pquotino. But, only one of them is supported at any * point of time. So, if PQUOTA is set in disk superblock, - * copy over sb_gquotino to sb_pquotino. + * copy over sb_gquotino to sb_pquotino. The NULLFSINO test + * above is to make sure we don't do this twice and wipe them + * both out! */ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8d74870468c2..717909f2f7b7 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -57,7 +57,6 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) -#define NULLEXTNUM ((xfs_extnum_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) @@ -75,11 +74,14 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). + * CRC enable filesystems use 512 byte inodes, meaning 512 byte block sizes + * cannot be used. */ #define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ #define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) #define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1)) #define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ #define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3e57a56cf829..0f56fcd3a5d5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -37,11 +37,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> -/* flags for direct write completions */ -#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) -#define XFS_DIO_FLAG_APPEND (1 << 1) -#define XFS_DIO_FLAG_COW (1 << 2) - /* * structure owned by writepages passed to individual writepage calls */ @@ -495,8 +490,8 @@ xfs_submit_ioend( ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -567,8 +562,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); submit_bio(ioend->io_bio); ioend->io_bio = new; } @@ -777,7 +771,7 @@ xfs_map_cow( { struct xfs_inode *ip = XFS_I(inode); struct xfs_bmbt_irec imap; - bool is_cow = false, need_alloc = false; + bool is_cow = false; int error; /* @@ -795,7 +789,7 @@ xfs_map_cow( * Else we need to check if there is a COW mapping at this offset. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!is_cow) @@ -805,7 +799,7 @@ xfs_map_cow( * And if the COW mapping has a delayed extent here we need to * allocate real space for it now. */ - if (need_alloc) { + if (isnullstartblock(imap.br_startblock)) { error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, &imap); if (error) @@ -1176,45 +1170,6 @@ xfs_vm_releasepage( } /* - * When we map a DIO buffer, we may need to pass flags to - * xfs_end_io_direct_write to tell it what kind of write IO we are doing. - * - * Note that for DIO, an IO to the highest supported file block offset (i.e. - * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 - * bit variable. Hence if we see this overflow, we have to assume that the IO is - * extending the file size. We won't know for sure until IO completion is run - * and the actual max write offset is communicated to the IO completion - * routine. - */ -static void -xfs_map_direct( - struct inode *inode, - struct buffer_head *bh_result, - struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool is_cow) -{ - uintptr_t *flags = (uintptr_t *)&bh_result->b_private; - xfs_off_t size = bh_result->b_size; - - trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : - XFS_IO_OVERWRITE, imap); - - if (ISUNWRITTEN(imap)) { - *flags |= XFS_DIO_FLAG_UNWRITTEN; - set_buffer_defer_completion(bh_result); - } else if (is_cow) { - *flags |= XFS_DIO_FLAG_COW; - set_buffer_defer_completion(bh_result); - } - if (offset + size > i_size_read(inode) || offset + size < 0) { - *flags |= XFS_DIO_FLAG_APPEND; - set_buffer_defer_completion(bh_result); - } -} - -/* * If this is O_DIRECT or the mpage code calling tell them how large the mapping * is, so that we can avoid repeated get_blocks calls. * @@ -1254,52 +1209,12 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } -/* Bounce unaligned directio writes to the page cache. */ static int -xfs_bounce_unaligned_dio_write( - struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap) -{ - struct xfs_bmbt_irec irec; - xfs_fileoff_t delta; - bool shared; - bool x; - int error; - - irec = *imap; - if (offset_fsb > irec.br_startoff) { - delta = offset_fsb - irec.br_startoff; - irec.br_blockcount -= delta; - irec.br_startblock += delta; - irec.br_startoff = offset_fsb; - } - error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); - if (error) - return error; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If any part - * of the extent is shared, fall back to buffered mode to handle - * the RMW. This is done by returning -EREMCHG ("remote addr - * changed"), which is caught further up the call stack. - */ - if (shared) { - trace_xfs_reflink_bounce_dio_write(ip, imap); - return -EREMCHG; - } - return 0; -} - -STATIC int -__xfs_get_blocks( +xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, - int create, - bool direct, - bool dax_fault) + int create) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1310,11 +1225,8 @@ __xfs_get_blocks( int nimaps = 1; xfs_off_t offset; ssize_t size; - int new = 0; - bool is_cow = false; - bool need_alloc = false; - BUG_ON(create && !direct); + BUG_ON(create); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1323,7 +1235,7 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && offset >= i_size_read(inode)) + if (offset >= i_size_read(inode)) return 0; /* @@ -1338,52 +1250,12 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - if (create && direct && xfs_is_reflink_inode(ip)) - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, - &need_alloc); - if (!is_cow) { - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); - /* - * Truncate an overwrite extent if there's a pending CoW - * reservation before the end of this extent. This - * forces us to come back to get_blocks to take care of - * the CoW. - */ - if (create && direct && nimaps && - imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) - xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, - &imap); - } - ASSERT(!need_alloc); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; - /* for DAX, we convert unwritten extents directly */ - if (create && - (!nimaps || - (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK) || - (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; - - trace_xfs_get_blocks_alloc(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_DELALLOC, &imap); - } else if (nimaps) { + if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); @@ -1393,12 +1265,6 @@ __xfs_get_blocks( goto out_unlock; } - if (IS_DAX(inode) && create) { - ASSERT(!ISUNWRITTEN(&imap)); - /* zeroing is not needed at a higher layer */ - new = 0; - } - /* trim mapping down to size requested */ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); @@ -1408,50 +1274,14 @@ __xfs_get_blocks( */ if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && - (create || !ISUNWRITTEN(&imap))) { - if (create && direct && !is_cow) { - error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, - &imap); - if (error) - return error; - } - + !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); - if (ISUNWRITTEN(&imap)) - set_buffer_unwritten(bh_result); - /* direct IO needs special help */ - if (create) { - if (dax_fault) - ASSERT(!ISUNWRITTEN(&imap)); - else - xfs_map_direct(inode, bh_result, &imap, offset, - is_cow); - } - } /* * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ bh_result->b_bdev = xfs_find_bdev_for_inode(inode); - - /* - * If we previously allocated a block out beyond eof and we are now - * coming back to use it then we will need to flag it as new even if it - * has a disk address. - * - * With sub-block writes into unwritten extents we also need to mark - * the buffer as new so that the unwritten parts of the buffer gets - * correctly zeroed. - */ - if (create && - ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || - (new || ISUNWRITTEN(&imap)))) - set_buffer_new(bh_result); - - BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); - return 0; out_unlock: @@ -1459,110 +1289,6 @@ out_unlock: return error; } -int -xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); -} - -int -xfs_get_blocks_direct( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); -} - -int -xfs_get_blocks_dax_fault( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); -} - -/* - * Complete a direct I/O write request. - * - * xfs_map_direct passes us some flags in the private data to tell us what to - * do. If no flags are set, then the write IO is an overwrite wholly within - * the existing allocated file size and so there is nothing for us to do. - * - * Note that in this case the completion can be called in interrupt context, - * whereas if we have flags set we will always be called in task context - * (i.e. from a workqueue). - */ -int -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - uintptr_t flags = (uintptr_t)private; - int error = 0; - - trace_xfs_end_io_direct_write(ip, offset, size); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - if (size <= 0) - return size; - - /* - * The flags tell us whether we are doing unwritten extent conversions - * or an append transaction that updates the on-disk file size. These - * cases are the only cases where we should *potentially* be needing - * to update the VFS inode size. - */ - if (flags == 0) { - ASSERT(offset + size <= i_size_read(inode)); - return 0; - } - - /* - * We need to update the in-core inode size here so that we don't end up - * with the on-disk inode size being outside the in-core inode size. We - * have no other method of updating EOF for AIO, so always do it here - * if necessary. - * - * We need to lock the test/set EOF update as we can be racing with - * other IO completions here to update the EOF. Failing to serialise - * here can result in EOF moving backwards and Bad Things Happen when - * that occurs. - */ - spin_lock(&ip->i_flags_lock); - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); - - if (flags & XFS_DIO_FLAG_COW) - error = xfs_reflink_end_cow(ip, offset, size); - if (flags & XFS_DIO_FLAG_UNWRITTEN) { - trace_xfs_end_io_direct_write_unwritten(ip, offset, size); - - error = xfs_iomap_write_unwritten(ip, offset, size); - } - if (flags & XFS_DIO_FLAG_APPEND) { - trace_xfs_end_io_direct_write_append(ip, offset, size); - - error = xfs_setfilesize(ip, offset, size); - } - - return error; -} - STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1583,7 +1309,6 @@ xfs_vm_bmap( struct xfs_inode *ip = XFS_I(inode); trace_xfs_vm_bmap(XFS_I(inode)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); /* * The swap code (ab-)uses ->bmap to get a block mapping and then @@ -1591,12 +1316,10 @@ xfs_vm_bmap( * that on reflinks inodes, so we have to skip out here. And yes, * 0 is the magic code for a bmap error.. */ - if (xfs_is_reflink_inode(ip)) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (xfs_is_reflink_inode(ip)) return 0; - } + filemap_write_and_wait(mapping); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b3c6634f9518..cc174ec6c2fd 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -55,15 +55,6 @@ struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; -int xfs_get_blocks(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_direct(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); - -int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private); int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e3da5d448bcf..d14691aa02b4 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ -/* Return 0 on success, or -errno; other state communicated via *context */ -typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, +/* void; state communicated via *context */ +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); typedef struct xfs_attr_list_context { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 25e76cd6c053..97c45b6eb91e 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_sf_entry_t *sfe; xfs_inode_t *dp; int sbsize, nsbuf, count, i; - int error; ASSERT(context != NULL); dp = context->dp; @@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - error = context->put_listent(context, - sfe->flags, - sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen); - if (error) - return error; + context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen); /* * Either search callback finished early or * didn't fit it all in the buffer after all. @@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->hashval = sbp->hash; cursor->offset = 0; } - error = context->put_listent(context, - sbp->flags, - sbp->name, - sbp->namelen, - sbp->valuelen); - if (error) { - kmem_free(sbuf); - return error; - } + context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen); if (context->seen_enough) break; cursor->offset++; @@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr3_leaf_list_int(bp, context); - if (error) { - xfs_trans_brelse(NULL, bp); - return error; - } + xfs_attr3_leaf_list_int(bp, context); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; @@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ -int +void xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) @@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int( struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; struct xfs_attr_leaf_entry *entry; - int retval; int i; struct xfs_mount *mp = context->dp->i_mount; @@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int( } if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return 0; + return; } } else { entry = &entries[0]; @@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int( /* * We have found our place, start copying out the new attributes. */ - retval = 0; for (; i < ichdr.count; entry++, i++) { char *name; int namelen, valuelen; @@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int( valuelen = be32_to_cpu(name_rmt->valuelen); } - retval = context->put_listent(context, entry->flags, + context->put_listent(context, entry->flags, name, namelen, valuelen); - if (retval) - break; if (context->seen_enough) break; cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return retval; + return; } /* @@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) if (error) return error; - error = xfs_attr3_leaf_list_int(bp, context); + xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); - return error; + return 0; } int @@ -513,7 +498,7 @@ xfs_attr_list_int( * Take care to check values and protect against them changing later, * we may be reading them directly out of a user buffer. */ -STATIC int +STATIC void xfs_attr_put_listent( xfs_attr_list_context_t *context, int flags, @@ -536,10 +521,10 @@ xfs_attr_put_listent( */ if (((context->flags & ATTR_SECURE) == 0) != ((flags & XFS_ATTR_SECURE) == 0)) - return 0; + return; if (((context->flags & ATTR_ROOT) == 0) != ((flags & XFS_ATTR_ROOT) == 0)) - return 0; + return; arraytop = sizeof(*alist) + context->count * sizeof(alist->al_offset[0]); @@ -548,7 +533,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 0; + return; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; @@ -558,7 +543,7 @@ xfs_attr_put_listent( alist->al_offset[context->count++] = context->firstu; alist->al_count = context->count; trace_xfs_attr_list_add(context); - return 0; + return; } /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 552465e011ec..b9abce524c33 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -359,9 +359,7 @@ xfs_bmap_count_blocks( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); + xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count); return 0; } @@ -426,7 +424,7 @@ xfs_getbmapx_fix_eof_hole( ifp = XFS_IFORK_PTR(ip, whichfork); if (!moretocome && xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + (lastx == xfs_iext_count(ifp) - 1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -1792,6 +1790,7 @@ xfs_swap_extent_forks( struct xfs_ifork tempifp, *ifp, *tifp; int aforkblks = 0; int taforkblks = 0; + xfs_extnum_t nextents; __uint64_t tmp; int error; @@ -1877,14 +1876,13 @@ xfs_swap_extent_forks( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { - ifp->if_u1.if_extents = - ifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&ip->i_df); + if (nextents <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1896,14 +1894,13 @@ xfs_swap_extent_forks( switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { - tifp->if_u1.if_extents = - tifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&tip->i_df); + if (nextents <= XFS_INLINE_EXTS) + tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1938,8 +1935,8 @@ xfs_swap_extents( * page cache safely. Once we have done this we can take the ilocks and * do the rest of the checks. */ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); + lock_flags = XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ @@ -2079,15 +2076,13 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); + unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - -out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - return error; + goto out_unlock; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b5b9bffe3520..7f0a01f7b592 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -219,7 +219,6 @@ _xfs_buf_alloc( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); - RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); @@ -473,6 +472,62 @@ _xfs_buf_map_pages( /* * Finding and Reading Buffers */ +static int +_xfs_buf_obj_cmp( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct xfs_buf_map *map = arg->key; + const struct xfs_buf *bp = obj; + + /* + * The key hashing in the lookup path depends on the key being the + * first element of the compare_arg, make sure to assert this. + */ + BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); + + if (bp->b_bn != map->bm_bn) + return 1; + + if (unlikely(bp->b_length != map->bm_len)) { + /* + * found a block number match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching for an exact match. + */ + ASSERT(bp->b_flags & XBF_STALE); + return 1; + } + return 0; +} + +static const struct rhashtable_params xfs_buf_hash_params = { + .min_size = 32, /* empty AGs have minimal footprint */ + .nelem_hint = 16, + .key_len = sizeof(xfs_daddr_t), + .key_offset = offsetof(struct xfs_buf, b_bn), + .head_offset = offsetof(struct xfs_buf, b_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = _xfs_buf_obj_cmp, +}; + +int +xfs_buf_hash_init( + struct xfs_perag *pag) +{ + spin_lock_init(&pag->pag_buf_lock); + return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); +} + +void +xfs_buf_hash_destroy( + struct xfs_perag *pag) +{ + rhashtable_destroy(&pag->pag_buf_hash); +} /* * Look up, and creates if absent, a lockable buffer for @@ -488,27 +543,24 @@ _xfs_buf_find( xfs_buf_t *new_bp) { struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent; xfs_buf_t *bp; - xfs_daddr_t blkno = map[0].bm_bn; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; xfs_daddr_t eofs; - int numblks = 0; int i; for (i = 0; i < nmaps; i++) - numblks += map[i].bm_len; + cmap.bm_len += map[i].bm_len; /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno < 0 || blkno >= eofs) { + if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports @@ -516,53 +568,29 @@ _xfs_buf_find( */ xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", - __func__, blkno, eofs); + __func__, cmap.bm_bn, eofs); WARN_ON(1); return NULL; } - /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, blkno)); + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - /* walk tree */ spin_lock(&pag->pag_buf_lock); - rbp = &pag->pag_buf_tree.rb_node; - parent = NULL; - bp = NULL; - while (*rbp) { - parent = *rbp; - bp = rb_entry(parent, struct xfs_buf, b_rbnode); - - if (blkno < bp->b_bn) - rbp = &(*rbp)->rb_left; - else if (blkno > bp->b_bn) - rbp = &(*rbp)->rb_right; - else { - /* - * found a block number match. If the range doesn't - * match, the only way this is allowed is if the buffer - * in the cache is stale and the transaction that made - * it stale has not yet committed. i.e. we are - * reallocating a busy extent. Skip this buffer and - * continue searching to the right for an exact match. - */ - if (bp->b_length != numblks) { - ASSERT(bp->b_flags & XBF_STALE); - rbp = &(*rbp)->rb_right; - continue; - } - atomic_inc(&bp->b_hold); - goto found; - } + bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, + xfs_buf_hash_params); + if (bp) { + atomic_inc(&bp->b_hold); + goto found; } /* No match found */ if (new_bp) { - rb_link_node(&new_bp->b_rbnode, parent, rbp); - rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); @@ -930,7 +958,6 @@ xfs_buf_rele( if (!pag) { ASSERT(list_empty(&bp->b_lru)); - ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) { xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); @@ -938,8 +965,6 @@ xfs_buf_rele( return; } - ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); - ASSERT(atomic_read(&bp->b_hold) > 0); release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); @@ -983,7 +1008,8 @@ xfs_buf_rele( } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); freebuf = true; @@ -1304,7 +1330,7 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - op_flags = WRITE_SYNC; + op_flags = REQ_SYNC; if (bp->b_flags & XBF_FUA) op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) @@ -1711,8 +1737,7 @@ xfs_free_buftarg( percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_blkdev_issue_flush(btp); + xfs_blkdev_issue_flush(btp); kmem_free(btp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1c2e52b2d926..8a9d3a9599f0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -71,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ @@ -150,7 +151,7 @@ typedef struct xfs_buf { * which is the only bit that is touched if we hit the semaphore * fast-path on locking. */ - struct rb_node b_rbnode; /* rbtree node */ + struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 29816981b50a..003a99b83bd8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -677,7 +677,6 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -686,7 +685,6 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6e4f7f900fea..65d27a502909 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -48,40 +48,6 @@ static const struct vm_operations_struct xfs_file_vm_ops; /* - * Locking primitives for read and write IO paths to ensure we consistently use - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. - */ -static inline void -xfs_rw_ilock( - struct xfs_inode *ip, - int type) -{ - if (type & XFS_IOLOCK_EXCL) - inode_lock(VFS_I(ip)); - xfs_ilock(ip, type); -} - -static inline void -xfs_rw_iunlock( - struct xfs_inode *ip, - int type) -{ - xfs_iunlock(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -static inline void -xfs_rw_ilock_demote( - struct xfs_inode *ip, - int type) -{ - xfs_ilock_demote(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -/* * Clear the specified ranges to zero through either the pagecache or DAX. * Holes and unwritten extents will be left as-is as they already are zeroed. */ @@ -183,19 +149,16 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { - /* - * If we have an RT and/or log subvolume we need to make sure - * to flush the write cache the device used for file data - * first. This is to ensure newly written file data make - * it to disk before logging the new inode size in case of - * an extending write. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); - else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); - } + /* + * If we have an RT and/or log subvolume we need to make sure to flush + * the write cache the device used for file data first. This is to + * ensure newly written file data make it to disk before logging the new + * inode size in case of an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); /* * All metadata updates are logged, which means that we just have to @@ -230,10 +193,8 @@ xfs_file_fsync( * an already allocated file and thus do not have any metadata to * commit. */ - if ((mp->m_flags & XFS_MOUNT_BARRIER) && - mp->m_logdev_targp == mp->m_ddev_targp && - !XFS_IS_REALTIME_INODE(ip) && - !log_flushed) + if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && + mp->m_logdev_targp == mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); return error; @@ -244,62 +205,21 @@ xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - loff_t isize = i_size_read(inode); + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); size_t count = iov_iter_count(to); - loff_t end = iocb->ki_pos + count - 1; - struct iov_iter data; - struct xfs_buftarg *target; - ssize_t ret = 0; + ssize_t ret; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ - if (XFS_IS_REALTIME_INODE(ip)) - target = ip->i_mount->m_rtdev_targp; - else - target = ip->i_mount->m_ddev_targp; - - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } - file_accessed(iocb->ki_filp); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out_unlock; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. - */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); - data = *to; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); - if (ret >= 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } - -out_unlock: - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -317,9 +237,9 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); return ret; @@ -335,9 +255,9 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -418,15 +338,18 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock, true); + error = xfs_break_layouts(inode, iolock); if (error) return error; - /* For changing security info in file_remove_privs() we need i_mutex */ + /* + * For changing security info in file_remove_privs() we need i_rwsem + * exclusively. + */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); goto restart; } /* @@ -451,9 +374,9 @@ restart: spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* @@ -496,6 +419,58 @@ restart: return 0; } +static int +xfs_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + loff_t offset = iocb->ki_pos; + bool update_size = false; + int error = 0; + + trace_xfs_end_io_direct_write(ip, offset, size); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (size <= 0) + return size; + + /* + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&ip->i_flags_lock); + if (offset + size > i_size_read(inode)) { + i_size_write(inode, offset + size); + update_size = true; + } + spin_unlock(&ip->i_flags_lock); + + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + if (flags & IOMAP_DIO_UNWRITTEN) + error = xfs_iomap_write_unwritten(ip, offset, size); + else if (update_size) + error = xfs_setfilesize(ip, offset, size); + + return error; +} + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -535,9 +510,7 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - loff_t end; - struct iov_iter data; - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ @@ -559,29 +532,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); - end = iocb->ki_pos + count - 1; - - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. - */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } /* * If we are doing unaligned IO, wait for all other IO to drain, @@ -591,7 +547,7 @@ xfs_file_dio_aio_write( if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -604,24 +560,9 @@ xfs_file_dio_aio_write( goto out; } - data = *from; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - - /* see generic_file_direct_write() for why this is necessary */ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } - - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either @@ -643,7 +584,7 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -652,15 +593,13 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); - - ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } - out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return error ? error : ret; } @@ -677,7 +616,7 @@ xfs_file_buffered_aio_write( int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -721,7 +660,7 @@ write_retry: current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return ret; } @@ -797,7 +736,7 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -939,7 +878,6 @@ xfs_file_clone_range( len, false); } -#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) STATIC ssize_t xfs_file_dedupe_range( struct file *src_file, @@ -950,14 +888,6 @@ xfs_file_dedupe_range( { int error; - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. - */ - if (len > XFS_MAX_DEDUPE_LEN) - len = XFS_MAX_DEDUPE_LEN; - error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) @@ -1474,7 +1404,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1501,15 +1431,9 @@ xfs_filemap_fault( return xfs_filemap_page_mkwrite(vma, vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) { - /* - * we do not want to trigger unwritten extent conversion on read - * faults - that is unnecessary overhead and would also require - * changes to xfs_get_blocks_direct() to map unwritten extent - * ioend for conversion on read-only mappings. - */ - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); - } else + if (IS_DAX(inode)) + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1545,7 +1469,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); + ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f295049db681..ff4d6311c7f4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -70,8 +70,6 @@ xfs_inode_alloc( ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -123,7 +121,6 @@ __xfs_inode_free( { /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); @@ -133,6 +130,8 @@ void xfs_inode_free( struct xfs_inode *ip) { + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -393,8 +392,8 @@ xfs_iget_cache_hit( xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + init_rwsem(&inode->i_rwsem); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -981,6 +980,7 @@ restart: if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip, false); goto reclaim; } @@ -989,10 +989,10 @@ restart: goto out_ifunlock; xfs_iunpin_wait(ip); } - if (xfs_iflags_test(ip, XFS_ISTALE)) - goto reclaim; - if (xfs_inode_clean(ip)) + if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { + xfs_ifunlock(ip); goto reclaim; + } /* * Never flush out dirty data during non-blocking reclaim, as it would @@ -1030,25 +1030,24 @@ restart: xfs_buf_relse(bp); } - xfs_iflock(ip); reclaim: + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. - * We do this as early as possible under the ILOCK and flush lock so - * that xfs_iflush_cluster() can be guaranteed to detect races with us - * here. By doing this, we guarantee that once xfs_iflush_cluster has - * locked both the XFS_ILOCK and the flush lock that it will see either - * a valid, flushable inode that will serialise correctly against the - * locks below, or it will see a clean (and invalid) inode that it can - * skip. + * We do this as early as possible under the ILOCK so that + * xfs_iflush_cluster() can be guaranteed to detect races with us here. + * By doing this, we guarantee that once xfs_iflush_cluster has locked + * XFS_ILOCK that it will see either a valid, flushable inode that will + * serialise correctly, or it will see a clean (and invalid) inode that + * it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); @@ -1580,10 +1579,15 @@ xfs_inode_free_cowblocks( struct xfs_eofblocks *eofb = args; bool need_iolock = true; int match; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_reflink_has_real_cow_blocks(ip)) { + /* + * Just clear the tag if we have an empty cow fork or none at all. It's + * possible the inode was fully unshared since it was originally tagged. + */ + if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); return 0; diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d45ca72af6fb..865ad1373e5e 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -133,7 +133,7 @@ xfs_icreate_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_icreate_item_ops = { +static const struct xfs_item_ops xfs_icreate_item_ops = { .iop_size = xfs_icreate_item_size, .iop_format = xfs_icreate_item_format, .iop_pin = xfs_icreate_item_pin, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4e560e6a12c1..b9557795eb74 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and - * the i_lock. This routine allows various combinations of the locks to be - * obtained. + * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 + * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, * the mmap lock second and the ilock last in order to prevent deadlock. * * Basic locking order: * - * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock * * mmap_sem locking order: * - * i_iolock -> page lock -> mmap_sem + * i_rwsem -> page lock -> mmap_sem * mmap_sem -> i_mmap_lock -> page_lock * * The difference in mmap_sem locking order mean that we cannot hold the * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can * fault in pages during copy in/out (for buffered IO) or require the mmap_sem * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because * page faults already hold the mmap_sem. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both * taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). @@ -191,10 +191,13 @@ xfs_ilock( (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_IOLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_IOLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_MMAPLOCK_EXCL) mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) + if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) + if (!down_read_trylock(&VFS_I(ip)->i_rwsem)) goto out; } @@ -271,9 +274,9 @@ out_undo_mmaplock: mrunlock_shared(&ip->i_mmaplock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); out: return 0; } @@ -310,9 +313,9 @@ xfs_iunlock( ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) mrunlock_excl(&ip->i_mmaplock); @@ -345,7 +348,7 @@ xfs_ilock_demote( if (lock_flags & XFS_MMAPLOCK_EXCL) mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); + downgrade_write(&VFS_I(ip)->i_rwsem); trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } @@ -370,8 +373,9 @@ xfs_isilocked( if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); + return !debug_locks || + lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); + return rwsem_is_locked(&VFS_I(ip)->i_rwsem); } ASSERT(0); @@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); - ASSERT(xfs_lockdep_subclass_ok(subclass + - XFS_IOLOCK_PARENT_VAL)); class += subclass << XFS_IOLOCK_SHIFT; - if (lock_mode & XFS_IOLOCK_PARENT) - class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { @@ -477,8 +477,6 @@ xfs_lock_inodes( XFS_ILOCK_EXCL)); ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | XFS_ILOCK_SHARED))); - ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || - inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || @@ -581,10 +579,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); - ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); @@ -715,7 +711,6 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) goto out_unlock; @@ -724,14 +719,12 @@ xfs_lookup( if (error) goto out_free_name; - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -1215,8 +1208,7 @@ xfs_create( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); @@ -1252,7 +1244,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1325,7 +1317,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -1466,11 +1458,10 @@ xfs_link( if (error) goto std_return; - xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -2041,7 +2032,6 @@ xfs_iunlink( agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); return 0; @@ -2133,7 +2123,6 @@ xfs_iunlink_remove( agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); } else { @@ -2579,10 +2568,9 @@ xfs_remove( goto std_return; } - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* @@ -2963,12 +2951,6 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. */ - if (!new_parent) - xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - else - xfs_lock_two_inodes(src_dp, target_dp, - XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2976,9 +2958,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f14c1de2549d..10dcf27b4c85 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,7 +56,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_iolock; /* inode IO lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ @@ -246,6 +245,11 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * Synchronize processes attempting to flush the in-core inode back to disk. */ +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + extern void __xfs_iflock(struct xfs_inode *ip); static inline int xfs_iflock_nowait(struct xfs_inode *ip) @@ -261,16 +265,12 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { + ASSERT(xfs_isiflocked(ip)); xfs_iflags_clear(ip, XFS_IFLOCK); smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) @@ -332,7 +332,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * IOLOCK values * * 0-3 subclass value - * 4-7 PARENT subclass values + * 4-7 unused * * MMAPLOCK values * @@ -347,10 +347,8 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * */ #define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT_VAL 4 -#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_MAX_SUBCLASS 3 #define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9610e9c00952..d90e7811ccdd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -164,7 +164,7 @@ xfs_inode_item_format_data_fork( struct xfs_bmbt_rec *p; ASSERT(ip->i_df.if_u1.if_extents != NULL); - ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); + ASSERT(xfs_iext_count(&ip->i_df) > 0); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); @@ -261,7 +261,7 @@ xfs_inode_item_format_attr_fork( ip->i_afp->if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ASSERT(xfs_iext_count(ip->i_afp) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c245bed3249b..fc563b82aea6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -639,7 +639,7 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -910,16 +910,14 @@ xfs_ioc_fsgetxattr( if (attr) { if (ip->i_afp) { if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_afp->if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(ip->i_afp); else fa.fsx_nextents = ip->i_d.di_anextents; } else fa.fsx_nextents = 0; } else { if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_df.if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(&ip->i_df); else fa.fsx_nextents = ip->i_d.di_nextents; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 436e109bb01e..0d147428971e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -395,11 +395,12 @@ xfs_iomap_prealloc_size( struct xfs_inode *ip, loff_t offset, loff_t count, - xfs_extnum_t idx, - struct xfs_bmbt_irec *prev) + xfs_extnum_t idx) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + struct xfs_bmbt_irec prev; int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; @@ -419,8 +420,8 @@ xfs_iomap_prealloc_size( */ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - idx == 0 || - prev->br_startoff + prev->br_blockcount < offset_fsb) + !xfs_iext_get_extent(ifp, idx - 1, &prev) || + prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_writeio_blocks; /* @@ -439,8 +440,8 @@ xfs_iomap_prealloc_size( * always extends to MAXEXTLEN rather than falling short due to things * like stripe unit/width alignment of real extents. */ - if (prev->br_blockcount <= (MAXEXTLEN >> 1)) - alloc_blocks = prev->br_blockcount << 1; + if (prev.br_blockcount <= (MAXEXTLEN >> 1)) + alloc_blocks = prev.br_blockcount << 1; else alloc_blocks = XFS_B_TO_FSB(mp, offset); if (!alloc_blocks) @@ -535,11 +536,11 @@ xfs_file_iomap_begin_delay( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - xfs_fileoff_t end_fsb, orig_end_fsb; + xfs_fileoff_t end_fsb; int error = 0, eof = 0; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_extnum_t idx; + xfs_fsblock_t prealloc_blocks = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -563,8 +564,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, - &got, &prev); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); if (!eof && got.br_startoff <= offset_fsb) { if (xfs_is_reflink_inode(ip)) { bool shared; @@ -595,35 +595,32 @@ xfs_file_iomap_begin_delay( * the lower level functions are updated. */ count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = orig_end_fsb = - min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); if (eof) { - xfs_fsblock_t prealloc_blocks; - - prealloc_blocks = - xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); + prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; + xfs_fileoff_t p_end_fsb; end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); - end_fsb = XFS_B_TO_FSBT(mp, end_offset) + - prealloc_blocks; + p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; align = xfs_eof_alignment(ip, 0); if (align) - end_fsb = roundup_64(end_fsb, align); + p_end_fsb = roundup_64(p_end_fsb, align); - end_fsb = min(end_fsb, maxbytes_fsb); - ASSERT(end_fsb > offset_fsb); + p_end_fsb = min(p_end_fsb, maxbytes_fsb); + ASSERT(p_end_fsb > offset_fsb); + prealloc_blocks = p_end_fsb - end_fsb; } } retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, &got, - &prev, &idx, eof); + end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof); switch (error) { case 0: break; @@ -631,8 +628,8 @@ retry: case -EDQUOT: /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; + if (prealloc_blocks) { + prealloc_blocks = 0; goto retry; } /*FALLTHRU*/ @@ -640,13 +637,6 @@ retry: goto out_unlock; } - /* - * Tag the inode as speculatively preallocated so we can reclaim this - * space on demand, if necessary. - */ - if (end_fsb != orig_end_fsb) - xfs_inode_set_eofblocks_tag(ip); - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -960,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && ISUNWRITTEN(imap)); } +static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +{ + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) + return true; + if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + return true; + return false; +} + static int xfs_file_iomap_begin( struct inode *inode, @@ -979,18 +982,14 @@ xfs_file_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && - !xfs_get_extsz_hint(ip)) { + if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } - /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. - */ - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (need_excl_ilock(ip, flags)) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); } else { @@ -1003,17 +1002,41 @@ xfs_file_iomap_begin( offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); + if (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) { + shared = xfs_reflink_find_cow_mapping(ip, offset, &imap); + if (shared) { + xfs_iunlock(ip, lockmode); + goto alloc_done; + } + ASSERT(!isnullstartblock(imap.br_startblock)); + } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if (flags & IOMAP_REPORT) { + if ((flags & IOMAP_REPORT) || + (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) goto out_unlock; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If the + * extent is shared, fall back to buffered mode to handle the + * RMW. + */ + if (!(flags & IOMAP_REPORT) && shared) { + trace_xfs_reflink_bounce_dio_write(ip, &imap); + error = -EREMCHG; + goto out_unlock; + } } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { @@ -1048,6 +1071,7 @@ xfs_file_iomap_begin( if (error) return error; +alloc_done: iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 405a65cd9d6b..b930be0b1596 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -983,15 +983,13 @@ xfs_vn_setattr( struct xfs_inode *ip = XFS_I(d_inode(dentry)); uint iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); - error = xfs_break_layouts(d_inode(dentry), &iolock, true); - if (!error) { - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_break_layouts(d_inode(dentry), &iolock); + if (error) + return error; - error = xfs_vn_setattr_size(dentry, iattr); - } - xfs_iunlock(ip, iolock); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_vn_setattr_size(dentry, iattr); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { error = xfs_vn_setattr_nonsize(dentry, iattr); } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 68640fb63a54..a415f822f2c1 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -78,6 +78,7 @@ typedef __u32 xfs_nlink_t; #include <linux/freezer.h> #include <linux/list_sort.h> #include <linux/ratelimit.h> +#include <linux/rhashtable.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3b74fa011bb1..c39ac14ff540 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1668,7 +1668,7 @@ xlog_cksum( __uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum((char *)rhead, + crc = xfs_start_cksum_update((char *)rhead, sizeof(struct xlog_rec_header), offsetof(struct xlog_rec_header, h_crc)); @@ -1862,26 +1862,21 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { - bp->b_flags |= XBF_FUA; - - /* - * Flush the data device before flushing the log to make - * sure all meta data written back from the AIL actually made - * it to disk before stamping the new log tail LSN into the - * log buffer. For an external log we need to issue the - * flush explicitly, and unfortunately synchronously here; - * for an internal log we can simply use the block layer - * state machine for preflushes. - */ - if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - else - bp->b_flags |= XBF_FLUSH; - } + /* + * Flush the data device before flushing the log to make sure all meta + * data written back from the AIL actually made it to disk before + * stamping the new log tail LSN into the log buffer. For an external + * log we need to issue the flush explicitly, and unfortunately + * synchronously here; for an internal log we can simply use the block + * layer state machine for preflushes. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + else + bp->b_flags |= XBF_FLUSH; ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1906,10 +1901,8 @@ xlog_sync( xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) - bp->b_flags |= XBF_FUA; + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d..4a98762ec8b4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct list_head *bucket; struct xfs_buf_cancel *bcp; @@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct xfs_buf_cancel *bcp; @@ -5113,19 +5113,21 @@ xlog_recover_process( struct list_head *buffer_list) { int error; + __le32 old_crc = rhead->h_crc; __le32 crc; + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets h_crc to 0 so we must consider this valid even on v5 supers. + * sets old_crc to 0 so we must consider this valid even on v5 supers. * Otherwise, return EFSBADCRC on failure so the callers up the stack * know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (rhead->h_crc && crc != rhead->h_crc) + if (old_crc && crc != old_crc) return -EFSBADCRC; return 0; } @@ -5136,11 +5138,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != rhead->h_crc) { - if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (crc != old_crc) { + if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(rhead->h_crc), + le32_to_cpu(old_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b341f10cf481..9b9540db17a6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -157,6 +157,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -212,8 +213,8 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - spin_lock_init(&pag->pag_buf_lock); - pag->pag_buf_tree = RB_ROOT; + if (xfs_buf_hash_init(pag)) + goto out_unwind; if (radix_tree_preload(GFP_NOFS)) goto out_unwind; @@ -239,9 +240,11 @@ xfs_initialize_perag( return 0; out_unwind: + xfs_buf_hash_destroy(pag); kmem_free(pag); for (; index > first_initialised; index--) { pag = radix_tree_delete(&mp->m_perag_tree, index); + xfs_buf_hash_destroy(pag); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 819b80b15bfb..84f785218907 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -393,8 +393,8 @@ typedef struct xfs_perag { unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ - struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; /* for rcu-safe freeing */ struct rcu_head rcu_head; @@ -424,6 +424,9 @@ xfs_perag_resv( } } +int xfs_buf_hash_init(xfs_perag_t *pag); +void xfs_buf_hash_destroy(xfs_perag_t *pag); + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 93a7aafa56d6..2f2dc3c09ad0 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -32,8 +32,7 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock, - bool with_imutex) + uint *iolock) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -42,12 +41,8 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); - if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; - if (with_imutex) - inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index e8339f74966b..b587cb99b2b7 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); +int xfs_break_layouts(struct inode *inode, uint *iolock); #else static inline int -xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) +xfs_break_layouts(struct inode *inode, uint *iolock) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a60d9e2739d1..45e50ea90769 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1135,7 +1135,7 @@ xfs_qm_get_rtblks( return error; } rtblks = 0; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0; idx < nextents; idx++) rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); *O_rtblks = (xfs_qcnt_t)rtblks; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index a279b4e7f5fe..88fd03c66e99 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -243,12 +243,11 @@ xfs_reflink_reserve_cow( struct xfs_bmbt_irec *imap, bool *shared) { - struct xfs_bmbt_irec got, prev; - xfs_fileoff_t end_fsb, orig_end_fsb; - int eof = 0, error = 0; - bool trimmed; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; + int error = 0; + bool eof = false, trimmed; xfs_extnum_t idx; - xfs_extlen_t align; /* * Search the COW fork extent list first. This serves two purposes: @@ -258,8 +257,9 @@ xfs_reflink_reserve_cow( * extent list is generally faster than going out to the shared extent * tree. */ - xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, - &got, &prev); + + if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got)) + eof = true; if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); @@ -285,33 +285,12 @@ xfs_reflink_reserve_cow( if (error) return error; - end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; - - align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); - if (align) - end_fsb = roundup_64(end_fsb, align); - -retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - end_fsb - imap->br_startoff, &got, &prev, &idx, eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ + imap->br_blockcount, 0, &got, &idx, eof); + if (error == -ENOSPC || error == -EDQUOT) trace_xfs_reflink_cow_enospc(ip, imap); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; - goto retry; - } - /*FALLTHRU*/ - default: + if (error) return error; - } - - if (end_fsb != orig_end_fsb) - xfs_inode_set_cowblocks_tag(ip); trace_xfs_reflink_cow_alloc(ip, &got); return 0; @@ -418,87 +397,65 @@ xfs_reflink_allocate_cow_range( } /* - * Find the CoW reservation (and whether or not it needs block allocation) - * for a given byte offset of a file. + * Find the CoW reservation for a given byte offset of a file. */ bool xfs_reflink_find_cow_mapping( struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, - bool *need_alloc) + struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_fileoff_t bno; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb; + struct xfs_bmbt_irec got; xfs_extnum_t idx; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(xfs_is_reflink_inode(ip)); - /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - bno = XFS_B_TO_FSBT(ip->i_mount, offset); - gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); - if (!gotp) + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return false; - - xfs_bmbt_get_all(gotp, &irec); - if (bno >= irec.br_startoff + irec.br_blockcount || - bno < irec.br_startoff) + if (got.br_startoff > offset_fsb) return false; trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, - &irec); - - /* If it's still delalloc, we must allocate later. */ - *imap = irec; - *need_alloc = !!(isnullstartblock(irec.br_startblock)); - + &got); + *imap = got; return true; } /* * Trim an extent to end at the next CoW reservation past offset_fsb. */ -int +void xfs_reflink_trim_irec_to_next_cow( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; xfs_extnum_t idx; if (!xfs_is_reflink_inode(ip)) - return 0; + return; /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); - if (!gotp) - return 0; - xfs_bmbt_get_all(gotp, &irec); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + return; /* This is the extent before; try sliding up one. */ - if (irec.br_startoff < offset_fsb) { - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - return 0; - gotp = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_get_all(gotp, &irec); + if (got.br_startoff < offset_fsb) { + if (!xfs_iext_get_extent(ifp, idx + 1, &got)) + return; } - if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) - return 0; + if (got.br_startoff >= imap->br_startoff + imap->br_blockcount) + return; - imap->br_blockcount = irec.br_startoff - imap->br_startoff; + imap->br_blockcount = got.br_startoff - imap->br_startoff; trace_xfs_reflink_trim_irec(ip, imap); - - return 0; } /* @@ -512,18 +469,15 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t end_fsb) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; xfs_extnum_t idx; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error = 0, eof = 0; + int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; - - xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, - &got, &prev); - if (eof) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return 0; while (got.br_startoff < end_fsb) { @@ -566,9 +520,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } - if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) + if (!xfs_iext_get_extent(ifp, ++idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } /* clear tag if cow fork is emptied */ @@ -638,13 +591,13 @@ xfs_reflink_end_cow( xfs_off_t count) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error, eof = 0; + int error; unsigned int resblks; xfs_filblks_t rlen; xfs_extnum_t idx; @@ -668,13 +621,11 @@ xfs_reflink_end_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, - &got, &prev); - /* If there is a hole at end_fsb - 1 go to the previous extent */ - if (eof || got.br_startoff > end_fsb) { + if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || + got.br_startoff > end_fsb) { ASSERT(idx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + xfs_iext_get_extent(ifp, --idx, &got); } /* Walk backwards until we're out of the I/O range... */ @@ -722,11 +673,9 @@ xfs_reflink_end_cow( error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto out_defer; - next_extent: - if (idx < 0) + if (!xfs_iext_get_extent(ifp, idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } error = xfs_trans_commit(tp); @@ -1302,13 +1251,11 @@ xfs_reflink_remap_range( return -EIO; /* Lock both files against IO */ - if (same_inode) { - xfs_ilock(src, XFS_IOLOCK_EXCL); + lock_two_nondirectories(inode_in, inode_out); + if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); - } else { - xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + else xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); - } /* Don't touch certain kinds of inodes */ ret = -EPERM; @@ -1345,8 +1292,14 @@ xfs_reflink_remap_range( goto out_unlock; } - if (len == 0) + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (len == 0) { + if (is_dedupe) { + ret = 0; + goto out_unlock; + } len = isize - pos_in; + } /* Ensure offsets don't wrap and the input is inside i_size */ if (pos_in + len < pos_in || pos_out + len < pos_out || @@ -1447,11 +1400,9 @@ xfs_reflink_remap_range( out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - xfs_iunlock(src, XFS_IOLOCK_EXCL); - if (src->i_ino != dest->i_ino) { + if (!same_inode) xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - xfs_iunlock(dest, XFS_IOLOCK_EXCL); - } + unlock_two_nondirectories(inode_in, inode_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; @@ -1697,37 +1648,3 @@ out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } - -/* - * Does this inode have any real CoW reservations? - */ -bool -xfs_reflink_has_real_cow_blocks( - struct xfs_inode *ip) -{ - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_extnum_t idx; - - if (!xfs_is_reflink_inode(ip)) - return false; - - /* Go find the old extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); - while (gotp) { - xfs_bmbt_get_all(gotp, &irec); - - if (!isnullstartblock(irec.br_startblock)) - return true; - - /* Roll on... */ - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - break; - gotp = xfs_iext_get_ext(ifp, idx); - } - - return false; -} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index fad11607c9ad..aa6a4d64bd35 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -31,8 +31,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, bool *need_alloc); -extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap); +extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, @@ -50,6 +50,4 @@ extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); -extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); - #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 12d48cd8f8a4..f11282c96887 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) } /* extra precision counters */ for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes; - xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes; - xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes; + xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", @@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) for_each_possible_cpu(c) { preempt_disable(); /* save vn_active, it's a universal truth! */ - vn_active = per_cpu_ptr(stats, c)->vn_active; + vn_active = per_cpu_ptr(stats, c)->s.vn_active; memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); - per_cpu_ptr(stats, c)->vn_active = vn_active; + per_cpu_ptr(stats, c)->s.vn_active = vn_active; preempt_enable(); } } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 79ad2e69fc33..375840f5a99a 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -22,9 +22,37 @@ #include <linux/percpu.h> /* + * The btree stats arrays have fixed offsets for the different stats. We + * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and + * that allows us to use fixed offsets into the stats array for each btree + * stat. These index offsets are defined in the order they will be emitted + * in the stats files, so it is possible to add new btree stat types by + * appending to the enum list below. + */ +enum { + __XBTS_lookup = 0, + __XBTS_compare = 1, + __XBTS_insrec = 2, + __XBTS_delrec = 3, + __XBTS_newroot = 4, + __XBTS_killroot = 5, + __XBTS_increment = 6, + __XBTS_decrement = 7, + __XBTS_lshift = 8, + __XBTS_rshift = 9, + __XBTS_split = 10, + __XBTS_join = 11, + __XBTS_alloc = 12, + __XBTS_free = 13, + __XBTS_moves = 14, + + __XBTS_MAX = 15, +}; + +/* * XFS global statistics */ -struct xfsstats { +struct __xfsstats { # define XFSSTAT_END_EXTENT_ALLOC 4 __uint32_t xs_allocx; __uint32_t xs_allocb; @@ -117,118 +145,20 @@ struct xfsstats { __uint32_t xb_page_found; __uint32_t xb_get_read; /* Version 2 btree counters */ -#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) - __uint32_t xs_abtb_2_lookup; - __uint32_t xs_abtb_2_compare; - __uint32_t xs_abtb_2_insrec; - __uint32_t xs_abtb_2_delrec; - __uint32_t xs_abtb_2_newroot; - __uint32_t xs_abtb_2_killroot; - __uint32_t xs_abtb_2_increment; - __uint32_t xs_abtb_2_decrement; - __uint32_t xs_abtb_2_lshift; - __uint32_t xs_abtb_2_rshift; - __uint32_t xs_abtb_2_split; - __uint32_t xs_abtb_2_join; - __uint32_t xs_abtb_2_alloc; - __uint32_t xs_abtb_2_free; - __uint32_t xs_abtb_2_moves; -#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) - __uint32_t xs_abtc_2_lookup; - __uint32_t xs_abtc_2_compare; - __uint32_t xs_abtc_2_insrec; - __uint32_t xs_abtc_2_delrec; - __uint32_t xs_abtc_2_newroot; - __uint32_t xs_abtc_2_killroot; - __uint32_t xs_abtc_2_increment; - __uint32_t xs_abtc_2_decrement; - __uint32_t xs_abtc_2_lshift; - __uint32_t xs_abtc_2_rshift; - __uint32_t xs_abtc_2_split; - __uint32_t xs_abtc_2_join; - __uint32_t xs_abtc_2_alloc; - __uint32_t xs_abtc_2_free; - __uint32_t xs_abtc_2_moves; -#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) - __uint32_t xs_bmbt_2_lookup; - __uint32_t xs_bmbt_2_compare; - __uint32_t xs_bmbt_2_insrec; - __uint32_t xs_bmbt_2_delrec; - __uint32_t xs_bmbt_2_newroot; - __uint32_t xs_bmbt_2_killroot; - __uint32_t xs_bmbt_2_increment; - __uint32_t xs_bmbt_2_decrement; - __uint32_t xs_bmbt_2_lshift; - __uint32_t xs_bmbt_2_rshift; - __uint32_t xs_bmbt_2_split; - __uint32_t xs_bmbt_2_join; - __uint32_t xs_bmbt_2_alloc; - __uint32_t xs_bmbt_2_free; - __uint32_t xs_bmbt_2_moves; -#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) - __uint32_t xs_ibt_2_lookup; - __uint32_t xs_ibt_2_compare; - __uint32_t xs_ibt_2_insrec; - __uint32_t xs_ibt_2_delrec; - __uint32_t xs_ibt_2_newroot; - __uint32_t xs_ibt_2_killroot; - __uint32_t xs_ibt_2_increment; - __uint32_t xs_ibt_2_decrement; - __uint32_t xs_ibt_2_lshift; - __uint32_t xs_ibt_2_rshift; - __uint32_t xs_ibt_2_split; - __uint32_t xs_ibt_2_join; - __uint32_t xs_ibt_2_alloc; - __uint32_t xs_ibt_2_free; - __uint32_t xs_ibt_2_moves; -#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) - __uint32_t xs_fibt_2_lookup; - __uint32_t xs_fibt_2_compare; - __uint32_t xs_fibt_2_insrec; - __uint32_t xs_fibt_2_delrec; - __uint32_t xs_fibt_2_newroot; - __uint32_t xs_fibt_2_killroot; - __uint32_t xs_fibt_2_increment; - __uint32_t xs_fibt_2_decrement; - __uint32_t xs_fibt_2_lshift; - __uint32_t xs_fibt_2_rshift; - __uint32_t xs_fibt_2_split; - __uint32_t xs_fibt_2_join; - __uint32_t xs_fibt_2_alloc; - __uint32_t xs_fibt_2_free; - __uint32_t xs_fibt_2_moves; -#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15) - __uint32_t xs_rmap_2_lookup; - __uint32_t xs_rmap_2_compare; - __uint32_t xs_rmap_2_insrec; - __uint32_t xs_rmap_2_delrec; - __uint32_t xs_rmap_2_newroot; - __uint32_t xs_rmap_2_killroot; - __uint32_t xs_rmap_2_increment; - __uint32_t xs_rmap_2_decrement; - __uint32_t xs_rmap_2_lshift; - __uint32_t xs_rmap_2_rshift; - __uint32_t xs_rmap_2_split; - __uint32_t xs_rmap_2_join; - __uint32_t xs_rmap_2_alloc; - __uint32_t xs_rmap_2_free; - __uint32_t xs_rmap_2_moves; -#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15) - __uint32_t xs_refcbt_2_lookup; - __uint32_t xs_refcbt_2_compare; - __uint32_t xs_refcbt_2_insrec; - __uint32_t xs_refcbt_2_delrec; - __uint32_t xs_refcbt_2_newroot; - __uint32_t xs_refcbt_2_killroot; - __uint32_t xs_refcbt_2_increment; - __uint32_t xs_refcbt_2_decrement; - __uint32_t xs_refcbt_2_lshift; - __uint32_t xs_refcbt_2_rshift; - __uint32_t xs_refcbt_2_split; - __uint32_t xs_refcbt_2_join; - __uint32_t xs_refcbt_2_alloc; - __uint32_t xs_refcbt_2_free; - __uint32_t xs_refcbt_2_moves; +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) + __uint32_t xs_abtb_2[__XBTS_MAX]; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) + __uint32_t xs_abtc_2[__XBTS_MAX]; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) + __uint32_t xs_bmbt_2[__XBTS_MAX]; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) + __uint32_t xs_ibt_2[__XBTS_MAX]; +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) + __uint32_t xs_fibt_2[__XBTS_MAX]; +#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) + __uint32_t xs_rmap_2[__XBTS_MAX]; +#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) + __uint32_t xs_refcbt_2[__XBTS_MAX]; #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; @@ -245,26 +175,58 @@ struct xfsstats { __uint64_t xs_read_bytes; }; +struct xfsstats { + union { + struct __xfsstats s; + uint32_t a[XFSSTAT_END_XQMSTAT]; + }; +}; + +/* + * simple wrapper for getting the array index of s struct member offset + */ +#define XFS_STATS_CALC_INDEX(member) \ + (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t)) + + int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); void xfs_stats_clearall(struct xfsstats __percpu *stats); extern struct xstats xfsstats; #define XFS_STATS_INC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++; \ } while (0) #define XFS_STATS_DEC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--; \ } while (0) #define XFS_STATS_ADD(mp, v, inc) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc); \ +} while (0) + +#define XFS_STATS_INC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++; \ +} while (0) + +#define XFS_STATS_DEC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]; \ +} while (0) + +#define XFS_STATS_ADD_OFF(mp, off, inc) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc); \ } while (0) #if defined(CONFIG_PROC_FS) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ade4691e3f74..eecbaac08eba 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -104,9 +104,6 @@ static const match_table_t tokens = { {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ - {Opt_barrier, "barrier"}, /* use writer barriers for log write and - * unwritten extent conversion */ - {Opt_nobarrier, "nobarrier"}, /* .. disable */ {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ {Opt_inode32, "inode32"}, /* inode allocation limited to * XFS_MAXINUMBER_32 */ @@ -134,6 +131,12 @@ static const match_table_t tokens = { {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ + + /* Deprecated mount options scheduled for removal */ + {Opt_barrier, "barrier"}, /* use writer barriers for log write and + * unwritten extent conversion */ + {Opt_nobarrier, "nobarrier"}, /* .. disable */ + {Opt_err, NULL}, }; @@ -301,12 +304,6 @@ xfs_parseargs( case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; break; - case Opt_barrier: - mp->m_flags |= XFS_MOUNT_BARRIER; - break; - case Opt_nobarrier: - mp->m_flags &= ~XFS_MOUNT_BARRIER; - break; case Opt_ikeep: mp->m_flags |= XFS_MOUNT_IKEEP; break; @@ -374,6 +371,14 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DAX; break; #endif + case Opt_barrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; default: xfs_warn(mp, "unknown mount option [%s].", p); return -EINVAL; @@ -943,7 +948,7 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); @@ -1238,9 +1243,11 @@ xfs_fs_remount( token = match_token(p, tokens, args); switch (token) { case Opt_barrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); mp->m_flags |= XFS_MOUNT_BARRIER; break; case Opt_nobarrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); mp->m_flags &= ~XFS_MOUNT_BARRIER; break; case Opt_inode64: diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 58142aeeeea6..f2cb45ed1d54 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -238,8 +238,7 @@ xfs_symlink( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* @@ -287,7 +286,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; /* @@ -412,7 +411,7 @@ out_release_inode: xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0907752be62d..69c5bcd9a51b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -355,7 +355,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_submit); DEFINE_BUF_EVENT(xfs_buf_submit_wait); -DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock_fail); @@ -367,19 +366,15 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); -DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); -DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); -DEFINE_BUF_EVENT(xfs_da_btree_corrupt); DEFINE_BUF_EVENT(xfs_reset_dqcounts); -DEFINE_BUF_EVENT(xfs_inode_item_push); /* pass flags explicitly */ DECLARE_EVENT_CLASS(xfs_buf_flags_class, @@ -541,7 +536,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); -DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); DECLARE_EVENT_CLASS(xfs_filestream_class, TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), @@ -680,7 +674,6 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); -DEFINE_INODE_EVENT(xfs_evict_inode); DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); @@ -798,7 +791,6 @@ TRACE_EVENT(xfs_irec_merge_post, DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ TP_ARGS(ip, caller_ip)) -DEFINE_IREF_EVENT(xfs_ihold); DEFINE_IREF_EVENT(xfs_irele); DEFINE_IREF_EVENT(xfs_inode_pin); DEFINE_IREF_EVENT(xfs_inode_unpin); @@ -939,7 +931,6 @@ DEFINE_DQUOT_EVENT(xfs_dqget_miss); DEFINE_DQUOT_EVENT(xfs_dqget_freeing); DEFINE_DQUOT_EVENT(xfs_dqget_dup); DEFINE_DQUOT_EVENT(xfs_dqput); -DEFINE_DQUOT_EVENT(xfs_dqput_wait); DEFINE_DQUOT_EVENT(xfs_dqput_free); DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); @@ -1815,7 +1806,6 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_addname); DEFINE_ATTR_EVENT(xfs_attr_sf_create); DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); DEFINE_ATTR_EVENT(xfs_attr_sf_remove); -DEFINE_ATTR_EVENT(xfs_attr_sf_removename); DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); @@ -1844,7 +1834,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); DEFINE_ATTR_EVENT(xfs_attr_node_get); -DEFINE_ATTR_EVENT(xfs_attr_node_lookup); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_node_removename); @@ -2440,11 +2429,9 @@ DEFINE_DEFER_EVENT(xfs_defer_finish_done); DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); -DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error); DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work); DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); @@ -3092,87 +3079,6 @@ DEFINE_EVENT(xfs_double_io_class, name, \ struct xfs_inode *dest, xfs_off_t doffset), \ TP_ARGS(src, soffset, len, dest, doffset)) -/* two-file vfs io tracepoint class */ -DECLARE_EVENT_CLASS(xfs_double_vfs_io_class, - TP_PROTO(struct inode *src, u64 soffset, u64 len, - struct inode *dest, u64 doffset), - TP_ARGS(src, soffset, len, dest, doffset), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(unsigned long, src_ino) - __field(loff_t, src_isize) - __field(loff_t, src_offset) - __field(size_t, len) - __field(unsigned long, dest_ino) - __field(loff_t, dest_isize) - __field(loff_t, dest_offset) - ), - TP_fast_assign( - __entry->dev = src->i_sb->s_dev; - __entry->src_ino = src->i_ino; - __entry->src_isize = i_size_read(src); - __entry->src_offset = soffset; - __entry->len = len; - __entry->dest_ino = dest->i_ino; - __entry->dest_isize = i_size_read(dest); - __entry->dest_offset = doffset; - ), - TP_printk("dev %d:%d count %zd " - "ino 0x%lx isize 0x%llx offset 0x%llx -> " - "ino 0x%lx isize 0x%llx offset 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->len, - __entry->src_ino, - __entry->src_isize, - __entry->src_offset, - __entry->dest_ino, - __entry->dest_isize, - __entry->dest_offset) -) - -#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \ -DEFINE_EVENT(xfs_double_vfs_io_class, name, \ - TP_PROTO(struct inode *src, u64 soffset, u64 len, \ - struct inode *dest, u64 doffset), \ - TP_ARGS(src, soffset, len, dest, doffset)) - -/* CoW write tracepoint */ -DECLARE_EVENT_CLASS(xfs_copy_on_write_class, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, - xfs_extlen_t len, xfs_fsblock_t new_pblk), - TP_ARGS(ip, lblk, pblk, len, new_pblk), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_fsblock_t, pblk) - __field(xfs_extlen_t, len) - __field(xfs_fsblock_t, new_pblk) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->pblk = pblk; - __entry->len = len; - __entry->new_pblk = new_pblk; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx " - "len 0x%x new_pblk %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->pblk, - __entry->len, - __entry->new_pblk) -) - -#define DEFINE_COW_EVENT(name) \ -DEFINE_EVENT(xfs_copy_on_write_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \ - xfs_extlen_t len, xfs_fsblock_t new_pblk), \ - TP_ARGS(ip, lblk, pblk, len, new_pblk)) - /* inode/irec events */ DECLARE_EVENT_CLASS(xfs_inode_irec_class, TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), @@ -3292,8 +3198,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); @@ -3302,9 +3206,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); /* ioctl tracepoints */ -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink); -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range); -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same); TRACE_EVENT(xfs_ioctl_clone, TP_PROTO(struct inode *src, struct inode *dest), TP_ARGS(src, dest), @@ -3334,11 +3235,7 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); -DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block); -DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); @@ -3361,14 +3258,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); -DEFINE_COW_EVENT(xfs_reflink_fork_buf); -DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); -DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); /* rmap swapext tracepoints */ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 62900938f26d..0594db435972 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = { NULL }; -static int +static void __xfs_xattr_put_listent( struct xfs_attr_list_context *context, char *prefix, @@ -148,7 +148,7 @@ __xfs_xattr_put_listent( if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ context->seen_enough = 1; - return 0; + return; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -159,10 +159,10 @@ __xfs_xattr_put_listent( compute_size: context->count += prefix_len + namelen + 1; - return 0; + return; } -static int +static void xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, @@ -180,23 +180,19 @@ xfs_xattr_put_listent( if (namelen == SGI_ACL_FILE_SIZE && strncmp(name, SGI_ACL_FILE, SGI_ACL_FILE_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_ACCESS, strlen(XATTR_POSIX_ACL_ACCESS)); - if (ret) - return ret; } else if (namelen == SGI_ACL_DEFAULT_SIZE && strncmp(name, SGI_ACL_DEFAULT, SGI_ACL_DEFAULT_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_DEFAULT, strlen(XATTR_POSIX_ACL_DEFAULT)); - if (ret) - return ret; } #endif @@ -205,7 +201,7 @@ xfs_xattr_put_listent( * see them. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return; prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; @@ -217,8 +213,9 @@ xfs_xattr_put_listent( prefix_len = XATTR_USER_PREFIX_LEN; } - return __xfs_xattr_put_listent(context, prefix, prefix_len, name, - namelen); + __xfs_xattr_put_listent(context, prefix, prefix_len, name, + namelen); + return; } ssize_t |