diff options
Diffstat (limited to 'fs')
92 files changed, 1206 insertions, 815 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 52233fa6195f..887b673f6223 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -589,7 +589,7 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) */ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason) { - unsigned int debug_id = cell->debug_id; + unsigned int debug_id; time64_t now, expire_delay; int u, a; @@ -604,6 +604,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr if (cell->vl_servers->nr_servers) expire_delay = afs_cell_gc_delay; + debug_id = cell->debug_id; u = atomic_read(&cell->ref); a = atomic_dec_return(&cell->active); trace_afs_cell(debug_id, u, a, reason); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 1d2e61e0ab04..1bb5b9d7f0a2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -281,8 +281,7 @@ retry: if (ret < 0) goto error; - set_page_private(req->pages[i], 1); - SetPagePrivate(req->pages[i]); + attach_page_private(req->pages[i], (void *)1); unlock_page(req->pages[i]); i++; } else { @@ -1975,8 +1974,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); - set_page_private(page, 0); - ClearPagePrivate(page); + detach_page_private(page); /* The directory will need reloading. */ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) @@ -2003,8 +2001,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset, afs_stat_v(dvnode, n_inval); /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) { - set_page_private(page, 0); - ClearPagePrivate(page); - } + if (offset == 0 && length == PAGE_SIZE) + detach_page_private(page); } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index b108528bf010..2ffe09abae7f 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -243,10 +243,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode, index, gfp); if (!page) goto error; - if (!PagePrivate(page)) { - set_page_private(page, 1); - SetPagePrivate(page); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)1); dir_page = kmap(page); } diff --git a/fs/afs/file.c b/fs/afs/file.c index 371d1488cc54..85f5adf21aa0 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -33,6 +33,7 @@ const struct file_operations afs_file_operations = { .write_iter = afs_file_write, .mmap = afs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = afs_fsync, .lock = afs_lock, .flock = afs_flock, @@ -601,6 +602,63 @@ static int afs_readpages(struct file *file, struct address_space *mapping, } /* + * Adjust the dirty region of the page on truncation or full invalidation, + * getting rid of the markers altogether if the region is entirely invalidated. + */ +static void afs_invalidate_dirty(struct page *page, unsigned int offset, + unsigned int length) +{ + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + unsigned long priv; + unsigned int f, t, end = offset + length; + + priv = page_private(page); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0 && length == thp_size(page)) + goto full_invalidate; + + /* If the page was dirtied by page_mkwrite(), the PTE stays writable + * and we don't get another notification to tell us to expand it + * again. + */ + if (afs_is_page_dirty_mmapped(priv)) + return; + + /* We may need to shorten the dirty region */ + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); + + if (t <= offset || f >= end) + return; /* Doesn't overlap */ + + if (f < offset && t > end) + return; /* Splits the dirty region - just absorb it */ + + if (f >= offset && t <= end) + goto undirty; + + if (f < offset) + t = offset; + else + f = end; + if (f == t) + goto undirty; + + priv = afs_page_dirty(f, t); + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv); + return; + +undirty: + trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv); + clear_page_dirty_for_io(page); +full_invalidate: + priv = (unsigned long)detach_page_private(page); + trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv); +} + +/* * invalidate part or all of a page * - release a page and clean up its private data if offset is 0 (indicating * the entire page) @@ -608,31 +666,23 @@ static int afs_readpages(struct file *file, struct address_space *mapping, static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - unsigned long priv; - _enter("{%lu},%u,%u", page->index, offset, length); BUG_ON(!PageLocked(page)); +#ifdef CONFIG_AFS_FSCACHE /* we clean up only if the entire page is being invalidated */ if (offset == 0 && length == PAGE_SIZE) { -#ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); fscache_wait_on_page_write(vnode->cache, page); fscache_uncache_page(vnode->cache, page); } + } #endif - if (PagePrivate(page)) { - priv = page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("inval"), - page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); - } - } + if (PagePrivate(page)) + afs_invalidate_dirty(page, offset, length); _leave(""); } @@ -660,11 +710,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) #endif if (PagePrivate(page)) { - priv = page_private(page); + priv = (unsigned long)detach_page_private(page); trace_afs_page_dirty(vnode, tracepoint_string("rel"), page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); } /* indicate that the page can be released */ diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 81b0485fd22a..14d5d75f4b6e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -812,6 +812,7 @@ struct afs_operation { pgoff_t last; /* last page in mapping to deal with */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ + bool laundering; /* Laundering page, PG_writeback not set */ } store; struct { struct iattr *attr; @@ -857,6 +858,62 @@ struct afs_vnode_cache_aux { u64 data_version; } __packed; +/* + * We use page->private to hold the amount of the page that we've written to, + * splitting the field into two parts. However, we need to represent a range + * 0...PAGE_SIZE, so we reduce the resolution if the size of the page + * exceeds what we can encode. + */ +#ifdef CONFIG_64BIT +#define __AFS_PAGE_PRIV_MASK 0x7fffffffUL +#define __AFS_PAGE_PRIV_SHIFT 32 +#define __AFS_PAGE_PRIV_MMAPPED 0x80000000UL +#else +#define __AFS_PAGE_PRIV_MASK 0x7fffUL +#define __AFS_PAGE_PRIV_SHIFT 16 +#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL +#endif + +static inline unsigned int afs_page_dirty_resolution(void) +{ + int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); + return (shift > 0) ? shift : 0; +} + +static inline size_t afs_page_dirty_from(unsigned long priv) +{ + unsigned long x = priv & __AFS_PAGE_PRIV_MASK; + + /* The lower bound is inclusive */ + return x << afs_page_dirty_resolution(); +} + +static inline size_t afs_page_dirty_to(unsigned long priv) +{ + unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; + + /* The upper bound is immediately beyond the region */ + return (x + 1) << afs_page_dirty_resolution(); +} + +static inline unsigned long afs_page_dirty(size_t from, size_t to) +{ + unsigned int res = afs_page_dirty_resolution(); + from >>= res; + to = (to - 1) >> res; + return (to << __AFS_PAGE_PRIV_SHIFT) | from; +} + +static inline unsigned long afs_page_dirty_mmapped(unsigned long priv) +{ + return priv | __AFS_PAGE_PRIV_MMAPPED; +} + +static inline bool afs_is_page_dirty_mmapped(unsigned long priv) +{ + return priv & __AFS_PAGE_PRIV_MMAPPED; +} + #include <trace/events/afs.h> /*****************************************************************************/ diff --git a/fs/afs/write.c b/fs/afs/write.c index da12abd6db21..50371207f327 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -76,7 +76,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, */ int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + struct page **_page, void **fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; @@ -90,11 +90,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping, _enter("{%llx:%llu},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, index, from, to); - /* We want to store information about how much of a page is altered in - * page->private. - */ - BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8); - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -110,9 +105,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping, SetPageUptodate(page); } - /* page won't leak in error case: it eventually gets cleaned off LRU */ - *pagep = page; - try_again: /* See if this page is already partially written in a way that we can * merge the new write with. @@ -120,8 +112,8 @@ try_again: t = f = 0; if (PagePrivate(page)) { priv = page_private(page); - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); ASSERTCMP(f, <=, t); } @@ -138,21 +130,9 @@ try_again: if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && (to < f || from > t)) goto flush_conflicting_write; - if (from < f) - f = from; - if (to > t) - t = to; - } else { - f = from; - t = to; } - priv = (unsigned long)t << AFS_PRIV_SHIFT; - priv |= f; - trace_afs_page_dirty(vnode, tracepoint_string("begin"), - page->index, priv); - SetPagePrivate(page); - set_page_private(page, priv); + *_page = page; _leave(" = 0"); return 0; @@ -162,17 +142,18 @@ try_again: flush_conflicting_write: _debug("flush conflict"); ret = write_one_page(page); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } + if (ret < 0) + goto error; ret = lock_page_killable(page); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } + if (ret < 0) + goto error; goto try_again; + +error: + put_page(page); + _leave(" = %d", ret); + return ret; } /* @@ -184,6 +165,9 @@ int afs_write_end(struct file *file, struct address_space *mapping, { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct key *key = afs_file_key(file); + unsigned long priv; + unsigned int f, from = pos & (PAGE_SIZE - 1); + unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; int ret; @@ -215,6 +199,25 @@ int afs_write_end(struct file *file, struct address_space *mapping, SetPageUptodate(page); } + if (PagePrivate(page)) { + priv = page_private(page); + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); + if (from < f) + f = from; + if (to > t) + t = to; + priv = afs_page_dirty(f, t); + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), + page->index, priv); + } else { + priv = afs_page_dirty(from, to); + attach_page_private(page, (void *)priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty"), + page->index, priv); + } + set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); @@ -334,10 +337,9 @@ static void afs_pages_written_back(struct afs_vnode *vnode, ASSERTCMP(pv.nr, ==, count); for (loop = 0; loop < count; loop++) { - priv = page_private(pv.pages[loop]); + priv = (unsigned long)detach_page_private(pv.pages[loop]); trace_afs_page_dirty(vnode, tracepoint_string("clear"), pv.pages[loop]->index, priv); - set_page_private(pv.pages[loop], 0); end_page_writeback(pv.pages[loop]); } first += count; @@ -396,7 +398,8 @@ static void afs_store_data_success(struct afs_operation *op) op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); if (op->error == 0) { - afs_pages_written_back(vnode, op->store.first, op->store.last); + if (!op->store.laundering) + afs_pages_written_back(vnode, op->store.first, op->store.last); afs_stat_v(vnode, n_stores); atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) - (op->store.first * PAGE_SIZE + op->store.first_offset), @@ -415,7 +418,7 @@ static const struct afs_operation_ops afs_store_data_operation = { */ static int afs_store_data(struct address_space *mapping, pgoff_t first, pgoff_t last, - unsigned offset, unsigned to) + unsigned offset, unsigned to, bool laundering) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct afs_operation *op; @@ -448,6 +451,7 @@ static int afs_store_data(struct address_space *mapping, op->store.last = last; op->store.first_offset = offset; op->store.last_to = to; + op->store.laundering = laundering; op->mtime = vnode->vfs_inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; @@ -509,8 +513,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, */ start = primary_page->index; priv = page_private(primary_page); - offset = priv & AFS_PRIV_MAX; - to = priv >> AFS_PRIV_SHIFT; + offset = afs_page_dirty_from(priv); + to = afs_page_dirty_to(priv); trace_afs_page_dirty(vnode, tracepoint_string("store"), primary_page->index, priv); @@ -555,8 +559,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, } priv = page_private(page); - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); if (f != 0 && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { unlock_page(page); @@ -601,7 +605,7 @@ no_more: if (end > i_size) to = i_size & ~PAGE_MASK; - ret = afs_store_data(mapping, first, last, offset, to); + ret = afs_store_data(mapping, first, last, offset, to, false); switch (ret) { case 0: ret = count; @@ -857,12 +861,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) */ wait_on_page_writeback(vmf->page); - priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */ - priv |= 0; /* From */ + priv = afs_page_dirty(0, PAGE_SIZE); + priv = afs_page_dirty_mmapped(priv); trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), vmf->page->index, priv); - SetPagePrivate(vmf->page); - set_page_private(vmf->page, priv); + if (PagePrivate(vmf->page)) + set_page_private(vmf->page, priv); + else + attach_page_private(vmf->page, (void *)priv); file_update_time(file); sb_end_pagefault(inode->i_sb); @@ -915,19 +921,18 @@ int afs_launder_page(struct page *page) f = 0; t = PAGE_SIZE; if (PagePrivate(page)) { - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); } trace_afs_page_dirty(vnode, tracepoint_string("launder"), page->index, priv); - ret = afs_store_data(mapping, page->index, page->index, t, f); + ret = afs_store_data(mapping, page->index, page->index, t, f, true); } + priv = (unsigned long)detach_page_private(page); trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 84f3c4f57531..95c573dcda11 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -85,7 +85,7 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler, if (acl->size <= size) memcpy(buffer, acl->data, acl->size); else - op->error = -ERANGE; + ret = -ERANGE; } } @@ -148,11 +148,6 @@ static const struct xattr_handler afs_xattr_afs_acl_handler = { .set = afs_xattr_set_acl, }; -static void yfs_acl_put(struct afs_operation *op) -{ - yfs_free_opaque_acl(op->yacl); -} - static const struct afs_operation_ops yfs_fetch_opaque_acl_operation = { .issue_yfs_rpc = yfs_fs_fetch_opaque_acl, .success = afs_acl_success, @@ -246,7 +241,7 @@ error: static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { .issue_yfs_rpc = yfs_fs_store_opaque_acl2, .success = afs_acl_success, - .put = yfs_acl_put, + .put = afs_acl_put, }; /* diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 3b1239b7e90d..bd787e71a657 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1990,6 +1990,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op) memcpy(bp, acl->data, acl->size); if (acl->size != size) memset((void *)bp + acl->size, 0, size - acl->size); + bp += size / sizeof(__be32); yfs_check_req(call, bp); trace_afs_make_fs_call(call, &vp->fid); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b6b3d052ca86..fa50e8936f5f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1690,7 +1690,7 @@ struct elf_thread_core_info { struct elf_thread_core_info *next; struct task_struct *task; struct elf_prstatus prstatus; - struct memelfnote notes[0]; + struct memelfnote notes[]; }; struct elf_note_info { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b3268f4ea5f3..771a036867dc 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -544,7 +544,18 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, int level = ref->level; struct btrfs_key search_key = ref->key_for_search; - root = btrfs_get_fs_root(fs_info, ref->root_id, false); + /* + * If we're search_commit_root we could possibly be holding locks on + * other tree nodes. This happens when qgroups does backref walks when + * adding new delayed refs. To deal with this we need to look in cache + * for the root, and if we don't find it then we need to search the + * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage + * here. + */ + if (path->search_commit_root) + root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id); + else + root = btrfs_get_fs_root(fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c0f1d6818df7..3ba6f3839d39 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2024,6 +2024,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) key.offset = 0; btrfs_release_path(path); } + btrfs_release_path(path); list_for_each_entry(space_info, &info->space_info, list) { int i; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 7e1549a84fcc..bc920afe23bf 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -511,7 +511,8 @@ again: /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "BTRFS: block rsv returned %d\n", ret); + "BTRFS: block rsv %d returned %d\n", + block_rsv->type, ret); } try_reserve: ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index aac3d6f4e35b..0378933d163c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3564,6 +3564,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct extent_buffer *eb, int err); +void btrfs_reada_remove_dev(struct btrfs_device *dev); +void btrfs_reada_undo_remove_dev(struct btrfs_device *dev); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4a0243cb9d97..10638537b9ef 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -91,6 +91,17 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { no_valid_dev_replace_entry_found: + /* + * We don't have a replace item or it's corrupted. If there is + * a replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, + BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + btrfs_err(fs_info, + "found replace target device without a valid replace item"); + ret = -EUCLEAN; + goto out; + } ret = 0; dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; @@ -143,8 +154,19 @@ no_valid_dev_replace_entry_found: case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - dev_replace->srcdev = NULL; - dev_replace->tgtdev = NULL; + /* + * We don't have an active replace item but if there is a + * replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, + BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + btrfs_err(fs_info, + "replace devid present without an active replace item"); + ret = -EUCLEAN; + } else { + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + } break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: @@ -688,6 +710,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + if (!scrub_ret) + btrfs_reada_remove_dev(src_device); + /* * We have to use this loop approach because at this point src_device * has to be available for transaction commit to complete, yet new @@ -696,6 +721,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, while (1) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { + btrfs_reada_undo_remove_dev(src_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } @@ -746,6 +772,7 @@ error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_reada_undo_remove_dev(src_device); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8e3438672a82..af97ddcc6b3e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1281,32 +1281,26 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) +static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, + struct btrfs_path *path, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; - struct btrfs_path *path; u64 generation; int ret; int level; - path = btrfs_alloc_path(); - if (!path) - return ERR_PTR(-ENOMEM); - root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); - if (!root) { - ret = -ENOMEM; - goto alloc_fail; - } + if (!root) + return ERR_PTR(-ENOMEM); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { if (ret > 0) ret = -ENOENT; - goto find_fail; + goto fail; } generation = btrfs_root_generation(&root->root_item); @@ -1317,21 +1311,31 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; - goto find_fail; + goto fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto find_fail; + goto fail; } root->commit_root = btrfs_root_node(root); -out: - btrfs_free_path(path); return root; - -find_fail: +fail: btrfs_put_root(root); -alloc_fail: - root = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); +} + +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) +{ + struct btrfs_root *root; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + root = read_tree_root_path(tree_root, path, key); + btrfs_free_path(path); + + return root; } /* @@ -1419,6 +1423,31 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, return root; } +static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, + u64 objectid) +{ + if (objectid == BTRFS_ROOT_TREE_OBJECTID) + return btrfs_grab_root(fs_info->tree_root); + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + return btrfs_grab_root(fs_info->extent_root); + if (objectid == BTRFS_CHUNK_TREE_OBJECTID) + return btrfs_grab_root(fs_info->chunk_root); + if (objectid == BTRFS_DEV_TREE_OBJECTID) + return btrfs_grab_root(fs_info->dev_root); + if (objectid == BTRFS_CSUM_TREE_OBJECTID) + return btrfs_grab_root(fs_info->csum_root); + if (objectid == BTRFS_QUOTA_TREE_OBJECTID) + return btrfs_grab_root(fs_info->quota_root) ? + fs_info->quota_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_UUID_TREE_OBJECTID) + return btrfs_grab_root(fs_info->uuid_root) ? + fs_info->uuid_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return btrfs_grab_root(fs_info->free_space_root) ? + fs_info->free_space_root : ERR_PTR(-ENOENT); + return NULL; +} + int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { @@ -1518,25 +1547,9 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, struct btrfs_key key; int ret; - if (objectid == BTRFS_ROOT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->tree_root); - if (objectid == BTRFS_EXTENT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->extent_root); - if (objectid == BTRFS_CHUNK_TREE_OBJECTID) - return btrfs_grab_root(fs_info->chunk_root); - if (objectid == BTRFS_DEV_TREE_OBJECTID) - return btrfs_grab_root(fs_info->dev_root); - if (objectid == BTRFS_CSUM_TREE_OBJECTID) - return btrfs_grab_root(fs_info->csum_root); - if (objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_root(fs_info->quota_root) ? - fs_info->quota_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_root(fs_info->uuid_root) ? - fs_info->uuid_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return btrfs_grab_root(fs_info->free_space_root) ? - fs_info->free_space_root : ERR_PTR(-ENOENT); + root = btrfs_get_global_root(fs_info, objectid); + if (root) + return root; again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { @@ -1622,6 +1635,52 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, } /* + * btrfs_get_fs_root_commit_root - return a root for the given objectid + * @fs_info: the fs_info + * @objectid: the objectid we need to lookup + * + * This is exclusively used for backref walking, and exists specifically because + * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref + * creation time, which means we may have to read the tree_root in order to look + * up a fs root that is not in memory. If the root is not in memory we will + * read the tree root commit root and look up the fs root from there. This is a + * temporary root, it will not be inserted into the radix tree as it doesn't + * have the most uptodate information, it'll simply be discarded once the + * backref code is finished using the root. + */ +struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_root *root; + struct btrfs_key key; + + ASSERT(path->search_commit_root && path->skip_locking); + + /* + * This can return -ENOENT if we ask for a root that doesn't exist, but + * since this is called via the backref walking code we won't be looking + * up a root that doesn't exist, unless there's corruption. So if root + * != NULL just return it. + */ + root = btrfs_get_global_root(fs_info, objectid); + if (root) + return root; + + root = btrfs_lookup_fs_root(fs_info, objectid); + if (root) + return root; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = read_tree_root_path(fs_info->tree_root, path, &key); + btrfs_release_path(path); + + return root; +} + +/* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens */ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index fee69ced58b4..182540bdcea0 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -69,6 +69,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, dev_t anon_dev); +struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 objectid); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3b21fee13e77..5fd60b13f4f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3185,7 +3185,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_tree_block_info *bi; if (item_size < sizeof(*ei) + sizeof(*bi)) { btrfs_crit(info, -"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu", +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu", key.objectid, key.type, key.offset, owner_objectid, item_size, sizeof(*ei) + sizeof(*bi)); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0ff659455b1e..87355a38a654 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3628,7 +3628,8 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) inode_lock_shared(inode); ret = btrfs_direct_IO(iocb, to); inode_unlock_shared(inode); - if (ret < 0) + if (ret < 0 || !iov_iter_count(to) || + iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 936c3137c646..da58c58ef9aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9672,10 +9672,16 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, * clear_offset by our extent size. */ clear_offset += ins.offset; - btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + /* + * Now that we inserted the prealloc extent we can finally + * decrement the number of reservations in the block group. + * If we did it before, we could race with relocation and have + * relocation miss the reserved extent, making it fail later. + */ + btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab408a23ba32..69a384145dc6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1274,6 +1274,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 page_start; u64 page_end; u64 page_cnt; + u64 start = (u64)start_index << PAGE_SHIFT; int ret; int i; int i_done; @@ -1290,8 +1291,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT); + start, page_cnt << PAGE_SHIFT); if (ret) return ret; i_done = 0; @@ -1380,8 +1380,7 @@ again: btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start_index << PAGE_SHIFT, - (page_cnt - i_done) << PAGE_SHIFT, true); + start, (page_cnt - i_done) << PAGE_SHIFT, true); } @@ -1408,8 +1407,7 @@ out: put_page(pages[i]); } btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT, true); + start, page_cnt << PAGE_SHIFT, true); btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 580899bdb991..77c54749f432 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1026,6 +1026,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_ROOT_REF_KEY) { + + /* Release locks on tree_root before we access quota_root */ + btrfs_release_path(path); + ret = add_qgroup_item(trans, quota_root, found_key.offset); if (ret) { @@ -1044,6 +1048,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_abort_transaction(trans, ret); goto out_free_path; } + ret = btrfs_search_slot_for_read(tree_root, &found_key, + path, 1, 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + if (ret > 0) { + /* + * Shouldn't happen, but in case it does we + * don't need to do the btrfs_next_item, just + * continue. + */ + continue; + } } ret = btrfs_next_item(tree_root, path); if (ret < 0) { @@ -3417,24 +3435,20 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, { struct rb_node *node; struct rb_node *next; - struct ulist_node *entry = NULL; + struct ulist_node *entry; int ret = 0; node = reserved->range_changed.root.rb_node; + if (!node) + return 0; while (node) { entry = rb_entry(node, struct ulist_node, rb_node); if (entry->val < start) node = node->rb_right; - else if (entry) - node = node->rb_left; else - break; + node = node->rb_left; } - /* Empty changeset */ - if (!entry) - return 0; - if (entry->val > start && rb_prev(&entry->rb_node)) entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, rb_node); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9d4f5316a7e8..d9a166eb344e 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -421,6 +421,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!dev->bdev) continue; + if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state)) + continue; + if (dev_replace_is_ongoing && dev == fs_info->dev_replace.tgtdev) { /* @@ -445,6 +448,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, } have_zone = 1; } + if (!have_zone) + radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); up_read(&fs_info->dev_replace.rwsem); @@ -1020,3 +1025,45 @@ void btrfs_reada_detach(void *handle) kref_put(&rc->refcnt, reada_control_release); } + +/* + * Before removing a device (device replace or device remove ioctls), call this + * function to wait for all existing readahead requests on the device and to + * make sure no one queues more readahead requests for the device. + * + * Must be called without holding neither the device list mutex nor the device + * replace semaphore, otherwise it will deadlock. + */ +void btrfs_reada_remove_dev(struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + + /* Serialize with readahead extent creation at reada_find_extent(). */ + spin_lock(&fs_info->reada_lock); + set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); + spin_unlock(&fs_info->reada_lock); + + /* + * There might be readahead requests added to the radix trees which + * were not yet added to the readahead work queue. We need to start + * them and wait for their completion, otherwise we can end up with + * use-after-free problems when dropping the last reference on the + * readahead extents and their zones, as they need to access the + * device structure. + */ + reada_start_machine(fs_info); + btrfs_flush_workqueue(fs_info->readahead_workers); +} + +/* + * If when removing a device (device replace or device remove ioctls) an error + * happens after calling btrfs_reada_remove_dev(), call this to undo what that + * function did. This is safe to call even if btrfs_reada_remove_dev() was not + * called before. + */ +void btrfs_reada_undo_remove_dev(struct btrfs_device *dev) +{ + spin_lock(&dev->fs_info->reada_lock); + clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); + spin_unlock(&dev->fs_info->reada_lock); +} diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 7f03dbe5b609..78693d3dd15b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -860,6 +860,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3602806d71bd..9ba92d86da0b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1648,6 +1648,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root_item *root_item; struct btrfs_path *path; struct extent_buffer *leaf; + int reserve_level; int level; int max_level; int replaced = 0; @@ -1696,7 +1697,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * Thus the needed metadata size is at most root_level * nodesize, * and * 2 since we have two trees to COW. */ - min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2; + reserve_level = max_t(int, 1, btrfs_root_level(root_item)); + min_reserved = fs_info->nodesize * reserve_level * 2; memset(&next_key, 0, sizeof(next_key)); while (1) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cf63f1e27a27..e71e7586e9eb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3866,8 +3866,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", - rcu_str_deref(dev->name)); + btrfs_err_in_rcu(fs_info, + "scrub on devid %llu: filesystem on %s is not writable", + devid, rcu_str_deref(dev->name)); ret = -EROFS; goto out; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index f0ffd5ee77bd..8784b74f5232 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -760,18 +760,36 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, u64 type; u64 features; bool mixed = false; + int raid_index; + int nparity; + int ncopies; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); + raid_index = btrfs_bg_flags_to_raid_index(type); + ncopies = btrfs_raid_array[raid_index].ncopies; + nparity = btrfs_raid_array[raid_index].nparity; if (!num_stripes) { chunk_err(leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } + if (num_stripes < ncopies) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes < ncopies, have %u < %d", + num_stripes, ncopies); + return -EUCLEAN; + } + if (nparity && num_stripes == nparity) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes == nparity, have %u == %d", + num_stripes, nparity); + return -EUCLEAN; + } if (!IS_ALIGNED(logical, fs_info->sectorsize)) { chunk_err(leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 58b9c419a2b6..a6406b3b8c2b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -431,7 +431,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); - btrfs_device_data_ordered_init(dev); + btrfs_device_data_ordered_init(dev, fs_info); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); extent_io_tree_init(fs_info, &dev->alloc_state, @@ -1056,22 +1056,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; } - if (device->devid == BTRFS_DEV_REPLACE_DEVID) { - /* - * In the first step, keep the device which has - * the correct fsid and the devid that is used - * for the dev_replace procedure. - * In the second step, the dev_replace state is - * read from the device tree and it is known - * whether the procedure is really active or - * not, which means whether this device is - * used or whether it should be removed. - */ - if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state)) { - continue; - } - } + /* + * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, + * in btrfs_init_dev_replace() so just continue. + */ + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + continue; + if (device->bdev) { blkdev_put(device->bdev, device->mode); device->bdev = NULL; @@ -1080,9 +1071,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state)) - fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; @@ -2099,6 +2087,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); + if (!ret) + btrfs_reada_remove_dev(device); mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -2179,6 +2169,7 @@ out: return ret; error_undo: + btrfs_reada_undo_remove_dev(device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bf27ac07d315..232f02bd214f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -39,10 +39,10 @@ struct btrfs_io_geometry { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __BTRFS_NEED_DEVICE_DATA_ORDERED -#define btrfs_device_data_ordered_init(device) \ - seqcount_init(&device->data_seqcount) +#define btrfs_device_data_ordered_init(device, info) \ + seqcount_mutex_init(&device->data_seqcount, &info->chunk_mutex) #else -#define btrfs_device_data_ordered_init(device) do { } while (0) +#define btrfs_device_data_ordered_init(device, info) do { } while (0) #endif #define BTRFS_DEV_STATE_WRITEABLE (0) @@ -50,6 +50,7 @@ struct btrfs_io_geometry { #define BTRFS_DEV_STATE_MISSING (2) #define BTRFS_DEV_STATE_REPLACE_TGT (3) #define BTRFS_DEV_STATE_FLUSH_SENT (4) +#define BTRFS_DEV_STATE_NO_READA (5) struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ @@ -71,7 +72,8 @@ struct btrfs_device { blk_status_t last_flush_error; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED - seqcount_t data_seqcount; + /* A seqcount_t with associated chunk_mutex (for lockdep) */ + seqcount_mutex_t data_seqcount; #endif /* the internal btrfs device id */ @@ -162,11 +164,9 @@ btrfs_device_get_##name(const struct btrfs_device *dev) \ static inline void \ btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ { \ - preempt_disable(); \ write_seqcount_begin(&dev->data_seqcount); \ dev->name = size; \ write_seqcount_end(&dev->data_seqcount); \ - preempt_enable(); \ } #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) #define BTRFS_DEVICE_GETSET_FUNCS(name) \ diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 3080cda9e824..8bda092e60c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -121,7 +121,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, _debug("reissue read"); ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) - goto unlock_discard; + goto discard; } /* but the page may have been read before the monitor was installed, so @@ -138,6 +138,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, unlock_discard: unlock_page(backpage); +discard: spin_lock_irq(&object->work_lock); list_del(&monitor->op_link); spin_unlock_irq(&object->work_lock); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5027bbdca419..ded4229c314a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4074,7 +4074,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap, inode); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 08f1c0c31dc2..8f1d7500a7ec 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4231,7 +4231,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, dname.len, dname.name); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); @@ -4385,29 +4385,49 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { - if (s->s_state == CEPH_MDS_SESSION_CLOSING) { - dout("resending session close request for mds%d\n", - s->s_mds); - request_close_session(s); - return false; - } - if (s->s_ttl && time_after(jiffies, s->s_ttl)) { - if (s->s_state == CEPH_MDS_SESSION_OPEN) { + switch (s->s_state) { + case CEPH_MDS_SESSION_OPEN: + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { s->s_state = CEPH_MDS_SESSION_HUNG; pr_info("mds%d hung\n", s->s_mds); } - } - if (s->s_state == CEPH_MDS_SESSION_NEW || - s->s_state == CEPH_MDS_SESSION_RESTARTING || - s->s_state == CEPH_MDS_SESSION_CLOSED || - s->s_state == CEPH_MDS_SESSION_REJECTED) - /* this mds is failed or recovering, just wait */ + break; + case CEPH_MDS_SESSION_CLOSING: + /* Should never reach this when we're unmounting */ + WARN_ON_ONCE(true); + fallthrough; + case CEPH_MDS_SESSION_NEW: + case CEPH_MDS_SESSION_RESTARTING: + case CEPH_MDS_SESSION_CLOSED: + case CEPH_MDS_SESSION_REJECTED: return false; + } return true; } /* + * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply, + * then we need to retransmit that request. + */ +void inc_session_sequence(struct ceph_mds_session *s) +{ + lockdep_assert_held(&s->s_mutex); + + s->s_seq++; + + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + int ret; + + dout("resending session close request for mds%d\n", s->s_mds); + ret = request_close_session(s); + if (ret < 0) + pr_err("unable to close session to mds%d: %d\n", + s->s_mds, ret); + } +} + +/* * delayed work -- periodically trim expired leases, renew caps with mds */ static void schedule_delayed(struct ceph_mds_client *mdsc) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index cbf8af437140..f5adbebcb38e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -480,6 +480,7 @@ struct ceph_mds_client { extern const char *ceph_mds_op_name(int op); extern bool check_session_state(struct ceph_mds_session *s); +void inc_session_sequence(struct ceph_mds_session *s); extern struct ceph_mds_session * __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 83cb4f26b689..9b785f11e95a 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -53,7 +53,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, /* increment msg sequence number */ mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); mutex_unlock(&session->s_mutex); /* lookup inode */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0da39c16dab4..b611f829cb61 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -873,7 +873,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ceph_snap_op_name(op), split, trace_len); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); mutex_unlock(&session->s_mutex); down_write(&mdsc->snap_rwsem); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 89bffa82ed74..c57bebfa48fe 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -74,7 +74,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) int i; /* The file must need contents encryption, not filenames encryption */ - if (!fscrypt_needs_contents_encryption(inode)) + if (!S_ISREG(inode->i_mode)) return 0; /* The crypto mode must have a blk-crypto counterpart */ diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index d3c3e5d9b41f..d595abb8ef90 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -269,9 +269,7 @@ unlock: * New inodes may not have an inode number assigned yet. * Hashing their inode number is delayed until later. */ - if (ci->ci_inode->i_ino == 0) - WARN_ON(!(ci->ci_inode->i_state & I_CREATING)); - else + if (ci->ci_inode->i_ino) fscrypt_hash_inode_number(ci, mk); return 0; } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index a768a09430c3..686e0ad28788 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1127,24 +1127,23 @@ static const struct file_operations debugfs_devm_entry_ops = { * file will be created in the root of the debugfs filesystem. * @read_fn: function pointer called to print the seq_file content. */ -struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, - struct dentry *parent, - int (*read_fn)(struct seq_file *s, - void *data)) +void debugfs_create_devm_seqfile(struct device *dev, const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, void *data)) { struct debugfs_devm_entry *entry; if (IS_ERR(parent)) - return ERR_PTR(-ENOENT); + return; entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); if (!entry) - return ERR_PTR(-ENOMEM); + return; entry->read = read_fn; entry->dev = dev; - return debugfs_create_file(name, S_IRUGO, parent, entry, - &debugfs_devm_entry_ops); + debugfs_create_file(name, S_IRUGO, parent, entry, + &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 139d0bed42f8..3e21c0e8adae 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -107,11 +107,9 @@ static struct page *erofs_read_inode(struct inode *inode, i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - /* ns timestamp */ - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = - le64_to_cpu(die->i_ctime); - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = - le32_to_cpu(die->i_ctime_nsec); + /* extended inode has its own timestamp */ + inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec); inode->i_size = le64_to_cpu(die->i_size); @@ -149,11 +147,9 @@ static struct page *erofs_read_inode(struct inode *inode, i_gid_write(inode, le16_to_cpu(dic->i_gid)); set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time to derive all file time */ - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = - sbi->build_time; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = - sbi->build_time_nsec; + /* use build time for compact inodes */ + inode->i_ctime.tv_sec = sbi->build_time; + inode->i_ctime.tv_nsec = sbi->build_time_nsec; inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) @@ -167,6 +163,11 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; + inode->i_atime.tv_sec = inode->i_ctime.tv_sec; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; + inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + if (!nblks) /* measure inode.i_blocks as generic filesystems */ inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 50912a5420b4..86fd3bf62af6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1078,8 +1078,11 @@ out_allocpage: cond_resched(); goto repeat; } - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + + if (tocache) { + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + } out: /* the only exit (for tracing and debugging) */ return page; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5b81f3b080ee..ca50c90adc4c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -669,68 +669,8 @@ const struct file_operations ext4_dir_operations = { }; #ifdef CONFIG_UNICODE -static int ext4_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) -{ - struct qstr qstr = {.name = str, .len = len }; - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *inode = d_inode_rcu(parent); - char strbuf[DNAME_INLINE_LEN]; - - if (!inode || !IS_CASEFOLDED(inode) || - !EXT4_SB(inode->i_sb)->s_encoding) { - if (len != name->len) - return -1; - return memcmp(str, name->name, len); - } - - /* - * If the dentry name is stored in-line, then it may be concurrently - * modified by a rename. If this happens, the VFS will eventually retry - * the lookup, so it doesn't matter what ->d_compare() returns. - * However, it's unsafe to call utf8_strncasecmp() with an unstable - * string. Therefore, we have to copy the name into a temporary buffer. - */ - if (len <= DNAME_INLINE_LEN - 1) { - memcpy(strbuf, str, len); - strbuf[len] = 0; - qstr.name = strbuf; - /* prevent compiler from optimizing out the temporary buffer */ - barrier(); - } - - return ext4_ci_compare(inode, name, &qstr, false); -} - -static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) -{ - const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); - const struct unicode_map *um = sbi->s_encoding; - const struct inode *inode = d_inode_rcu(dentry); - unsigned char *norm; - int len, ret = 0; - - if (!inode || !IS_CASEFOLDED(inode) || !um) - return 0; - - norm = kmalloc(PATH_MAX, GFP_ATOMIC); - if (!norm) - return -ENOMEM; - - len = utf8_casefold(um, str, norm, PATH_MAX); - if (len < 0) { - if (ext4_has_strict_mode(sbi)) - ret = -EINVAL; - goto out; - } - str->hash = full_name_hash(dentry, norm, len); -out: - kfree(norm); - return ret; -} - const struct dentry_operations ext4_dentry_ops = { - .d_hash = ext4_d_hash, - .d_compare = ext4_d_compare, + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, }; #endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 254d1c26bea8..bf9429484462 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1028,9 +1028,6 @@ struct ext4_inode_info { * protected by sbi->s_fc_lock. */ - /* Fast commit subtid when this inode was committed */ - unsigned int i_fc_committed_subtid; - /* Start of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_start; @@ -1166,10 +1163,6 @@ struct ext4_inode_info { #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT4_ERROR_FS 0x0002 /* Errors detected */ #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ -#define EXT4_FC_INELIGIBLE 0x0008 /* Fast commit ineligible */ -#define EXT4_FC_COMMITTING 0x0010 /* File system underoing a fast - * commit. - */ #define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ /* @@ -1238,13 +1231,13 @@ struct ext4_inode_info { blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ -#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */ -#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */ - #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1426,12 +1419,6 @@ struct ext4_super_block { #ifdef __KERNEL__ -/* - * run-time mount flags - */ -#define EXT4_MF_MNTDIR_SAMPLED 0x0001 -#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ - #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) #else @@ -1444,14 +1431,6 @@ struct ext4_super_block { #define EXT4_ENC_UTF8_12_1 1 /* - * Flags for ext4_sb_info.s_encoding_flags. - */ -#define EXT4_ENC_STRICT_MODE_FL (1 << 0) - -#define ext4_has_strict_mode(sbi) \ - (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) - -/* * fourth extended-fs super-block data in memory */ struct ext4_sb_info { @@ -1474,7 +1453,7 @@ struct ext4_sb_info { struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; - unsigned int s_mount_flags; + unsigned long s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; @@ -1500,10 +1479,6 @@ struct ext4_sb_info { struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; -#ifdef CONFIG_UNICODE - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif /* Journaling */ struct journal_s *s_journal; @@ -1707,6 +1682,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) }) /* + * run-time mount flags + */ +enum { + EXT4_MF_MNTDIR_SAMPLED, + EXT4_MF_FS_ABORTED, /* Fatal error detected */ + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ + EXT4_MF_FC_COMMITTING /* File system underoing a fast + * commit. + */ +}; + +static inline void ext4_set_mount_flag(struct super_block *sb, int bit) +{ + set_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) +{ + clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline int ext4_test_mount_flag(struct super_block *sb, int bit) +{ + return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + + +/* * Simulate_fail codes */ #define EXT4_SIM_BBITMAP_EIO 1 @@ -1875,6 +1878,13 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 +/* + * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes + * incompatible only if fast commit blocks are present in the FS. Since we + * clear the journal (and thus the fast commit blocks), we don't mark FS as + * incompatible. We also have a JBD2 incompat feature, which gets set when + * there are fast commit blocks present in the journal. + */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 @@ -2743,12 +2753,16 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); int ext4_fc_info_show(struct seq_file *seq, void *v); void ext4_fc_init(struct super_block *sb, journal_t *journal); void ext4_fc_init_inode(struct inode *inode); -void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); -void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_link(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_create(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_inode(struct inode *inode); +void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void __ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason); void ext4_fc_start_ineligible(struct super_block *sb, int reason); void ext4_fc_stop_ineligible(struct super_block *sb); @@ -3464,7 +3478,7 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, extern int ext4_ci_compare(const struct inode *parent, const struct qstr *fname, const struct qstr *entry, bool quick); -extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, +extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 559100f3e23c..17d7096b3212 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1471,16 +1471,16 @@ static int ext4_ext_search_left(struct inode *inode, } /* - * search the closest allocated block to the right for *logical - * and returns it at @logical + it's physical address at @phys - * if *logical is the largest allocated block, the function - * returns 0 at @phys - * return value contains 0 (success) or error code + * Search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys. + * If not exists, return 0 and @phys is set to 0. We will return + * 1 which means we found an allocated block and ret_ex is valid. + * Or return a (< 0) error code. */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent **ret_ex) + struct ext4_extent *ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1574,10 +1574,11 @@ got_index: found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); - *ret_ex = ex; + if (ret_ex) + *ret_ex = *ex; if (bh) put_bh(bh); - return 0; + return 1; } /* @@ -2868,8 +2869,8 @@ again: */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, - &ex); - if (err) + NULL); + if (err < 0) goto out; if (pblk) { partial.pclu = EXT4_B2C(sbi, pblk); @@ -3723,7 +3724,6 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); - ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1); return err; } @@ -3795,7 +3795,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; - ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1); return 0; } @@ -4039,7 +4038,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent newex, *ex, *ex2; + struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; int err = 0, depth, ret; @@ -4175,15 +4174,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out; ar.lright = map->m_lblk; - ex2 = NULL; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); - if (err) + if (err < 0) goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ - if ((sbi->s_cluster_ratio > 1) && ex2 && - get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { + if ((sbi->s_cluster_ratio > 1) && err && + get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; @@ -4329,7 +4327,6 @@ got_allocated_blocks: map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); - ext4_fc_track_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1); out: ext4_ext_drop_refs(path); kfree(path); @@ -4602,7 +4599,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - ext4_fc_track_range(inode, offset >> inode->i_sb->s_blocksize_bits, + ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits, (offset + len - 1) >> inode->i_sb->s_blocksize_bits); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); @@ -4651,8 +4648,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - ext4_fc_track_range(inode, offset >> blkbits, - (offset + len - 1) >> blkbits); ext4_fc_start_update(inode); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 447c8d93f480..f2033e13a273 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -83,7 +83,7 @@ * * Atomicity of commits * -------------------- - * In order to gaurantee atomicity during the commit operation, fast commit + * In order to guarantee atomicity during the commit operation, fast commit * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail * tag contains CRC of the contents and TID of the transaction after which * this fast commit should be applied. Recovery code replays fast commit @@ -152,7 +152,31 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); init_waitqueue_head(&ei->i_fc_wait); atomic_set(&ei->i_fc_updates, 0); - ei->i_fc_committed_subtid = 0; +} + +/* This function must be called with sbi->s_fc_lock held. */ +static void ext4_fc_wait_committing_inode(struct inode *inode) +__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) +{ + wait_queue_head_t *wq; + struct ext4_inode_info *ei = EXT4_I(inode); + +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); } /* @@ -176,22 +200,7 @@ restart: goto out; if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - wait_queue_head_t *wq; -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); + ext4_fc_wait_committing_inode(inode); goto restart; } out: @@ -234,26 +243,10 @@ restart: } if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - wait_queue_head_t *wq; -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); + ext4_fc_wait_committing_inode(inode); goto restart; } - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); } @@ -269,7 +262,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; - sbi->s_mount_state |= EXT4_FC_INELIGIBLE; + ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -292,7 +285,7 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason) } /* - * Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here + * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here * to ensure that after stopping the ineligible update, at least one full * commit takes place. */ @@ -302,14 +295,14 @@ void ext4_fc_stop_ineligible(struct super_block *sb) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; - EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE; + ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); } static inline int ext4_fc_is_ineligible(struct super_block *sb) { - return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) || - atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates); + return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || + atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); } /* @@ -323,13 +316,14 @@ static inline int ext4_fc_is_ineligible(struct super_block *sb) * If enqueue is set, this function enqueues the inode in fast commit list. */ static int ext4_fc_track_template( - struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), + handle_t *handle, struct inode *inode, + int (*__fc_track_fn)(struct inode *, void *, bool), void *args, int enqueue) { - tid_t running_txn_tid; bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + tid_t tid = 0; int ret; if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || @@ -339,15 +333,13 @@ static int ext4_fc_track_template( if (ext4_fc_is_ineligible(inode->i_sb)) return -EINVAL; - running_txn_tid = sbi->s_journal ? - sbi->s_journal->j_commit_sequence + 1 : 0; - + tid = handle->h_transaction->t_tid; mutex_lock(&ei->i_fc_lock); - if (running_txn_tid == ei->i_sync_tid) { + if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); - ei->i_sync_tid = running_txn_tid; + ei->i_sync_tid = tid; } ret = __fc_track_fn(inode, args, update); mutex_unlock(&ei->i_fc_lock); @@ -358,7 +350,7 @@ static int ext4_fc_track_template( spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (sbi->s_mount_state & EXT4_FC_COMMITTING) ? + (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); @@ -384,7 +376,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -397,7 +389,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_MEM); + EXT4_FC_REASON_NOMEM); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -411,7 +403,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) node->fcd_name.len = dentry->d_name.len; spin_lock(&sbi->s_fc_lock); - if (sbi->s_mount_state & EXT4_FC_COMMITTING) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else @@ -422,7 +414,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) +void __ext4_fc_track_unlink(handle_t *handle, + struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; @@ -430,12 +423,18 @@ void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(inode, dentry, ret); } -void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) +{ + __ext4_fc_track_unlink(handle, d_inode(dentry), dentry); +} + +void __ext4_fc_track_link(handle_t *handle, + struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; @@ -443,20 +442,26 @@ void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(inode, dentry, ret); } -void ext4_fc_track_create(struct inode *inode, struct dentry *dentry) +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) +{ + __ext4_fc_track_link(handle, d_inode(dentry), dentry); +} + +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct __track_dentry_update_args args; + struct inode *inode = d_inode(dentry); int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(inode, dentry, ret); } @@ -472,14 +477,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_inode(struct inode *inode) +void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { int ret; if (S_ISDIR(inode->i_mode)) return; - ret = ext4_fc_track_template(inode, __track_inode, NULL, 1); + if (ext4_should_journal_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, + EXT4_FC_REASON_INODE_JOURNAL_DATA); + return; + } + + ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(inode, ret); } @@ -515,7 +526,7 @@ static int __track_range(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; @@ -527,7 +538,7 @@ void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, args.start = start; args.end = end; - ret = ext4_fc_track_template(inode, __track_range, &args, 1); + ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(inode, start, end, ret); } @@ -537,10 +548,11 @@ static void ext4_fc_submit_bh(struct super_block *sb) int write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; + /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */ if (test_opt(sb, BARRIER)) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); - clear_buffer_dirty(bh); + set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE, write_flags, bh); @@ -846,7 +858,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) int ret = 0; spin_lock(&sbi->s_fc_lock); - sbi->s_mount_state |= EXT4_FC_COMMITTING; + ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { ei = list_entry(pos, struct ext4_inode_info, i_fc_list); ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); @@ -900,6 +912,8 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal) /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) +__acquires(&sbi->s_fc_lock) +__releases(&sbi->s_fc_lock) { struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -964,7 +978,6 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) fc_dentry->fcd_parent, fc_dentry->fcd_ino, fc_dentry->fcd_name.len, fc_dentry->fcd_name.name, crc)) { - spin_lock(&sbi->s_fc_lock); ret = -ENOSPC; goto lock_and_exit; } @@ -997,6 +1010,13 @@ static int ext4_fc_perform_commit(journal_t *journal) if (ret) return ret; + /* + * If file system device is different from journal device, issue a cache + * flush before we start writing fast commit blocks. + */ + if (journal->j_fs_dev != journal->j_dev) + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { /* @@ -1032,8 +1052,6 @@ static int ext4_fc_perform_commit(journal_t *journal) if (ret) goto out; spin_lock(&sbi->s_fc_lock); - EXT4_I(inode)->i_fc_committed_subtid = - atomic_read(&sbi->s_fc_subtid); } spin_unlock(&sbi->s_fc_lock); @@ -1132,7 +1150,7 @@ out: "Fast commit ended with blks = %d, reason = %d, subtid - %d", nblks, reason, subtid); if (reason == EXT4_FC_REASON_FC_FAILED) - return jbd2_fc_end_commit_fallback(journal, commit_tid); + return jbd2_fc_end_commit_fallback(journal); if (reason == EXT4_FC_REASON_FC_START_FAILED || reason == EXT4_FC_REASON_INELIGIBLE) return jbd2_complete_transaction(journal, commit_tid); @@ -1191,8 +1209,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_STAGING]); - sbi->s_mount_state &= ~EXT4_FC_COMMITTING; - sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; + ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (full) sbi->s_fc_bytes = 0; @@ -1264,7 +1282,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl) return 0; } - ret = __ext4_unlink(old_parent, &entry, inode); + ret = __ext4_unlink(NULL, old_parent, &entry, inode); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; @@ -1617,8 +1635,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, NULL, 0); - if (!path) - continue; + if (IS_ERR(path)) { + iput(inode); + return 0; + } memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( @@ -2087,13 +2107,9 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; - if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) { - pr_warn("Error while enabling fast commits, turning off."); - ext4_clear_feature_fast_commit(sb); - } } -const char *fc_ineligible_reasons[] = { +static const char *fc_ineligible_reasons[] = { "Extended attributes changed", "Cross rename", "Journal flag changed", @@ -2102,6 +2118,7 @@ const char *fc_ineligible_reasons[] = { "Resize", "Dir renamed", "Falloc range op", + "Data journalling", "FC Commit Failed" }; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 06907d485989..3a6e5a1fa1b8 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -3,9 +3,6 @@ #ifndef __FAST_COMMIT_H__ #define __FAST_COMMIT_H__ -/* Number of blocks in journal area to allocate for fast commits */ -#define EXT4_NUM_FC_BLKS 256 - /* Fast commit tags */ #define EXT4_FC_TAG_ADD_RANGE 0x0001 #define EXT4_FC_TAG_DEL_RANGE 0x0002 @@ -100,11 +97,12 @@ enum { EXT4_FC_REASON_XATTR = 0, EXT4_FC_REASON_CROSS_RENAME, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, - EXT4_FC_REASON_MEM, + EXT4_FC_REASON_NOMEM, EXT4_FC_REASON_SWAP_BOOT, EXT4_FC_REASON_RESIZE, EXT4_FC_REASON_RENAME_DIR, EXT4_FC_REASON_FALLOC_RANGE, + EXT4_FC_REASON_INODE_JOURNAL_DATA, EXT4_FC_COMMIT_FAILED, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d85412d12e3a..3ed8c048fb12 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -761,7 +761,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; - ext4_fc_start_update(inode); file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -769,7 +768,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) } else { vma->vm_ops = &ext4_file_vm_ops; } - ext4_fc_stop_update(inode); return 0; } @@ -782,13 +780,13 @@ static int ext4_sample_last_mounted(struct super_block *sb, handle_t *handle; int err; - if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) + if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; - sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b232c2767534..4c2a9fe30067 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -280,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, /* Fabricate an rmap entry for the external log device. */ irec.fmr_physical = journal->j_blk_offset; - irec.fmr_length = journal->j_maxlen; + irec.fmr_length = journal->j_total_len; irec.fmr_owner = EXT4_FMR_OWN_LOG; irec.fmr_flags = 0; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 81a545fd14a3..a42ca95840f2 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -143,7 +143,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) ret = -EROFS; goto out; } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 2924261226e0..a92eb79de0cc 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -275,7 +275,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { #ifdef CONFIG_UNICODE - const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding; + const struct unicode_map *um = dir->i_sb->s_encoding; int r, dlen; unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index caa51473207d..b41512d1badc 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1880,6 +1880,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { + ext4_write_unlock_xattr(inode, &no_expand); *has_inline = 0; ext4_journal_stop(handle); return 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03c2253005f0..0d8385aea898 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -327,6 +327,8 @@ stop_handle: ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: + if (!list_empty(&EXT4_I(inode)->i_fc_list)) + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } @@ -730,7 +732,7 @@ out_sem: if (ret) return ret; } - ext4_fc_track_range(inode, map->m_lblk, + ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); } @@ -1918,7 +1920,7 @@ static int __ext4_journalled_writepage(struct page *page, } if (ret == 0) ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, 0, len); + err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -2440,7 +2442,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, struct super_block *sb = inode->i_sb; if (ext4_forced_shutdown(EXT4_SB(sb)) || - EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2674,7 +2676,7 @@ static int ext4_writepages(struct address_space *mapping, * the stack trace. */ if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || - sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { ret = -EROFS; goto out_writepages; } @@ -3307,10 +3309,11 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) if (journal) { if (jbd2_transaction_committed(journal, - EXT4_I(inode)->i_datasync_tid)) - return true; - return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) >= - EXT4_I(inode)->i_fc_committed_subtid; + EXT4_I(inode)->i_datasync_tid)) + return false; + if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) + return !list_empty(&EXT4_I(inode)->i_fc_list); + return true; } /* Any metadata buffers to write? */ @@ -4107,7 +4110,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, first_block, stop_block); if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -5440,14 +5443,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (shrink) - ext4_fc_track_range(inode, + ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, (oldsize > 0 ? oldsize - 1 : 0) >> inode->i_sb->s_blocksize_bits); else ext4_fc_track_range( - inode, + handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> @@ -5697,7 +5700,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, put_bh(iloc->bh); return -EIO; } - ext4_fc_track_inode(inode); + ext4_fc_track_inode(handle, inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); @@ -6157,7 +6160,8 @@ retry_alloc: if (ext4_walk_page_buffers(handle, page_buffers(page), 0, len, NULL, write_end_fn)) goto out_error; - if (ext4_jbd2_inode_add_write(handle, inode, 0, len)) + if (ext4_jbd2_inode_add_write(handle, inode, + page_offset(page), len)) goto out_error; ext4_set_inode_state(inode, EXT4_STATE_JDATA); } else { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 85abbfb98cbe..24af9ed5c3e5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4477,7 +4477,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) return; ngroups = ext4_get_groups_count(sb); @@ -4508,7 +4508,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) return; mb_debug(sb, "Can't allocate:" @@ -5167,7 +5167,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, struct super_block *sb = ar->inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; - int i; + int i = sb->s_blocksize; ext4_fsblk_t goal, block; struct ext4_super_block *es = EXT4_SB(sb)->s_es; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5159830dacb8..33509266f5a0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1285,8 +1285,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) int ext4_ci_compare(const struct inode *parent, const struct qstr *name, const struct qstr *entry, bool quick) { - const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb); - const struct unicode_map *um = sbi->s_encoding; + const struct super_block *sb = parent->i_sb; + const struct unicode_map *um = sb->s_encoding; int ret; if (quick) @@ -1298,7 +1298,7 @@ int ext4_ci_compare(const struct inode *parent, const struct qstr *name, /* Handle invalid character sequence as either an error * or as an opaque byte sequence. */ - if (ext4_has_strict_mode(sbi)) + if (sb_has_strict_encoding(sb)) return -EINVAL; if (name->len != entry->len) @@ -1315,7 +1315,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, { int len; - if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) { + if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) { cf_name->name = NULL; return; } @@ -1324,7 +1324,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, if (!cf_name->name) return; - len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding, + len = utf8_casefold(dir->i_sb->s_encoding, iname, cf_name->name, EXT4_NAME_LEN); if (len <= 0) { @@ -1361,7 +1361,7 @@ static inline bool ext4_match(const struct inode *parent, #endif #ifdef CONFIG_UNICODE - if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) { + if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent)) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, .len = fname->cf_name.len}; @@ -2180,9 +2180,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; -#ifdef CONFIG_UNICODE - struct ext4_sb_info *sbi; -#endif struct ext4_filename fname; int retval; int dx_fallback=0; @@ -2199,9 +2196,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return -EINVAL; #ifdef CONFIG_UNICODE - sbi = EXT4_SB(sb); - if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && - sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) + if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && + sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; #endif @@ -2610,7 +2606,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; - struct inode *inode, *inode_save; + struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); @@ -2628,11 +2624,9 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - inode_save = inode; - ihold(inode_save); err = ext4_add_nondir(handle, dentry, &inode); - ext4_fc_track_create(inode_save, dentry); - iput(inode_save); + if (!err) + ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); @@ -2647,7 +2641,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; - struct inode *inode, *inode_save; + struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); @@ -2664,12 +2658,9 @@ retry: if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; - inode_save = inode; - ihold(inode_save); err = ext4_add_nondir(handle, dentry, &inode); if (!err) - ext4_fc_track_create(inode_save, dentry); - iput(inode_save); + ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); @@ -2833,7 +2824,6 @@ out_clear_inode: iput(inode); goto out_retry; } - ext4_fc_track_create(inode, dentry); ext4_inc_count(dir); ext4_update_dx_flag(dir); @@ -2841,6 +2831,7 @@ out_clear_inode: if (err) goto out_clear_inode; d_instantiate_new(dentry, inode); + ext4_fc_track_create(handle, dentry); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); @@ -3175,7 +3166,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; ext4_dec_count(dir); ext4_update_dx_flag(dir); - ext4_fc_track_unlink(inode, dentry); + ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); #ifdef CONFIG_UNICODE @@ -3196,13 +3187,12 @@ end_rmdir: return retval; } -int __ext4_unlink(struct inode *dir, const struct qstr *d_name, +int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle = NULL; int skip_remove_dentry = 0; bh = ext4_find_entry(dir, d_name, &de, NULL); @@ -3221,14 +3211,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else - goto out_bh; - } - - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) { - retval = PTR_ERR(handle); - goto out_bh; + goto out; } if (IS_DIRSYNC(dir)) @@ -3237,12 +3220,12 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto out_handle; + goto out; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto out_handle; + goto out; } else { retval = 0; } @@ -3256,15 +3239,14 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); -out_handle: - ext4_journal_stop(handle); -out_bh: +out: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { + handle_t *handle; int retval; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) @@ -3282,9 +3264,16 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (retval) goto out_trace; - retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry)); + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_trace; + } + + retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); if (!retval) - ext4_fc_track_unlink(d_inode(dentry), dentry); + ext4_fc_track_unlink(handle, dentry); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -3295,6 +3284,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif + if (handle) + ext4_journal_stop(handle); out_trace: trace_ext4_unlink_exit(dentry, retval); @@ -3451,7 +3442,6 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_fc_track_link(inode, dentry); err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time @@ -3459,6 +3449,7 @@ retry: if (inode->i_nlink == 1) ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); + ext4_fc_track_link(handle, dentry); } else { drop_nlink(inode); iput(inode); @@ -3919,9 +3910,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_FC_REASON_RENAME_DIR); } else { if (new.inode) - ext4_fc_track_unlink(new.inode, new.dentry); - ext4_fc_track_link(old.inode, new.dentry); - ext4_fc_track_unlink(old.inode, old.dentry); + ext4_fc_track_unlink(handle, new.dentry); + __ext4_fc_track_link(handle, old.inode, new.dentry); + __ext4_fc_track_unlink(handle, old.inode, old.dentry); } if (new.inode) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2fe141ff3c7e..6633b20224d5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -289,18 +289,7 @@ void ext4_superblock_csum_set(struct super_block *sb) if (!ext4_has_metadata_csum(sb)) return; - /* - * Locking the superblock prevents the scenario - * where: - * 1) a first thread pauses during checksum calculation. - * 2) a second thread updates the superblock, recalculates - * the checksum, and updates s_checksum - * 3) the first thread resumes and finishes its checksum calculation - * and updates s_checksum with a potentially stale or torn value. - */ - lock_buffer(EXT4_SB(sb)->s_sbh); es->s_checksum = ext4_superblock_csum(sb, es); - unlock_buffer(EXT4_SB(sb)->s_sbh); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -686,7 +675,7 @@ static void ext4_handle_error(struct super_block *sb) if (!test_opt(sb, ERRORS_CONT)) { journal_t *journal = EXT4_SB(sb)->s_journal; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -904,7 +893,7 @@ void __ext4_abort(struct super_block *sb, const char *function, va_end(args); if (sb_rdonly(sb) == 0) { - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); @@ -1288,7 +1277,7 @@ static void ext4_put_super(struct super_block *sb) fs_put_dax(sbi->s_daxdev); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif kfree(sbi); } @@ -1716,11 +1705,10 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, Opt_no_fc, + Opt_prefetch_block_bitmaps, #ifdef CONFIG_EXT4_DEBUG - Opt_fc_debug_max_replay, + Opt_fc_debug_max_replay, Opt_fc_debug_force #endif - Opt_fc_debug_force }; static const match_table_t tokens = { @@ -1807,9 +1795,8 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, - {Opt_no_fc, "no_fc"}, - {Opt_fc_debug_force, "fc_debug_force"}, #ifdef CONFIG_EXT4_DEBUG + {Opt_fc_debug_force, "fc_debug_force"}, {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"}, #endif {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, @@ -2027,8 +2014,8 @@ static const struct mount_opts { {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, - {Opt_usrjquota, 0, MOPT_Q}, - {Opt_grpjquota, 0, MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, {Opt_offusrjquota, 0, MOPT_Q}, {Opt_offgrpjquota, 0, MOPT_Q}, {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, @@ -2039,11 +2026,9 @@ static const struct mount_opts { {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, - {Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, - MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY}, +#ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, -#ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_max_replay, 0, MOPT_GTE0}, #endif {Opt_err, 0, 0} @@ -2153,7 +2138,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); return 1; case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); return 1; case Opt_i_version: sb->s_flags |= SB_I_VERSION; @@ -3976,7 +3961,7 @@ int ext4_calculate_overhead(struct super_block *sb) * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev) - overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); @@ -4303,7 +4288,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; #ifdef CONFIG_UNICODE - if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) { + if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags; @@ -4334,15 +4319,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "%s-%s with flags 0x%hx", encoding_info->name, encoding_info->version?:"\b", encoding_flags); - sbi->s_encoding = encoding; - sbi->s_encoding_flags = encoding_flags; + sb->s_encoding = encoding; + sb->s_encoding_flags = encoding_flags; } #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); + clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); @@ -4777,8 +4763,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; - sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; - sbi->s_mount_state &= ~EXT4_FC_COMMITTING; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; @@ -4857,6 +4843,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); + goto failed_mount_wq; + } + /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { @@ -4975,7 +4969,7 @@ no_journal: } #ifdef CONFIG_UNICODE - if (sbi->s_encoding) + if (sb->s_encoding) sb->s_d_op = &ext4_dentry_ops; #endif @@ -5184,7 +5178,7 @@ failed_mount: crypto_free_shash(sbi->s_chksum_driver); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA @@ -5872,7 +5866,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -5886,7 +5880,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { err = -EROFS; goto restore_opts; } @@ -6560,10 +6554,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, brelse(bh); out: if (inode->i_size < off + len) { - ext4_fc_track_range(inode, - (inode->i_size > 0 ? inode->i_size - 1 : 0) - >> inode->i_sb->s_blocksize_bits, - (off + len) >> inode->i_sb->s_blocksize_bits); i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 5ff33d18996a..4e27fe6ed3ae 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -315,6 +315,7 @@ EXT4_ATTR_FEATURE(casefold); EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); +EXT4_ATTR_FEATURE(fast_commit); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), @@ -331,6 +332,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(metadata_csum_seed), + ATTR_LIST(fast_commit), NULL, }; ATTRIBUTE_GROUPS(ext4_feat); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9cd2ecad07db..cc4f987687f3 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -77,7 +77,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, if (error) return error; if (!buffer_mapped(bh_result)) - return -EIO; + return -ENODATA; return 0; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8dff9cbd0a87..62d9081d1e26 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1301,12 +1301,8 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, trace_gfs2_bmap(ip, bh_map, lblock, create, 1); ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp); - if (!ret && iomap.type == IOMAP_HOLE) { - if (create) - ret = gfs2_iomap_alloc(inode, &iomap, &mp); - else - ret = -ENODATA; - } + if (create && !ret && iomap.type == IOMAP_HOLE) + ret = gfs2_iomap_alloc(inode, &iomap, &mp); release_metapath(&mp); if (ret) goto out; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5441c17562c5..d98a2e5dab9f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1078,7 +1078,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, out_free: kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(cachep, gl); - atomic_dec(&sdp->sd_glock_disposal); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); out: return ret; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index aa3f5236befb..6c1432d78dce 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -165,6 +165,31 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) } /** + * gfs2_rgrp_metasync - sync out the metadata of a resource group + * @gl: the glock protecting the resource group + * + */ + +static int gfs2_rgrp_metasync(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct address_space *metamapping = &sdp->sd_aspace; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; + int error; + + filemap_fdatawrite_range(metamapping, start, end); + error = filemap_fdatawait_range(metamapping, start, end); + WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); + mapping_set_error(metamapping, error); + if (error) + gfs2_io_error(sdp); + return error; +} + +/** * rgrp_go_sync - sync out the metadata for this glock * @gl: the glock * @@ -176,11 +201,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static int rgrp_go_sync(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); - const unsigned bsize = sdp->sd_sb.sb_bsize; - loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; - loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -189,10 +210,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl) gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_RGRP_GO_SYNC); - filemap_fdatawrite_range(mapping, start, end); - error = filemap_fdatawait_range(mapping, start, end); - WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); - mapping_set_error(mapping, error); + error = gfs2_rgrp_metasync(gl); if (!error) error = gfs2_ail_empty_gl(gl); gfs2_free_clones(rgd); @@ -266,7 +284,24 @@ static void gfs2_clear_glop_pending(struct gfs2_inode *ip) } /** - * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * gfs2_inode_metasync - sync out the metadata of an inode + * @gl: the glock protecting the inode + * + */ +int gfs2_inode_metasync(struct gfs2_glock *gl) +{ + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error; + + filemap_fdatawrite(metamapping); + error = filemap_fdatawait(metamapping); + if (error) + gfs2_io_error(gl->gl_name.ln_sbd); + return error; +} + +/** + * inode_go_sync - Sync the dirty metadata of an inode * @gl: the glock protecting the inode * */ @@ -297,8 +332,7 @@ static int inode_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(mapping); mapping_set_error(mapping, error); } - ret = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, ret); + ret = gfs2_inode_metasync(gl); if (!error) error = ret; gfs2_ail_empty_gl(gl); diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 2dd192e85618..695898afcaf1 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -22,6 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; +extern int gfs2_inode_metasync(struct gfs2_glock *gl); extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6774865f5b5b..077ccb1b3ccc 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -180,7 +180,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail; - gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); + if (blktype != GFS2_BLKST_UNLINKED) + gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9133b3178677..2e9314091c81 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -132,6 +132,8 @@ __acquires(&sdp->sd_ail_lock) spin_unlock(&sdp->sd_ail_lock); ret = generic_writepages(mapping, wbc); spin_lock(&sdp->sd_ail_lock); + if (ret == -ENODATA) /* if a jdata write into a new hole */ + ret = 0; /* ignore it */ if (ret || wbc->nr_to_write <= 0) break; return -EBUSY; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed69298dd824..3922b26264f5 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -22,6 +22,7 @@ #include "incore.h" #include "inode.h" #include "glock.h" +#include "glops.h" #include "log.h" #include "lops.h" #include "meta_io.h" @@ -817,41 +818,19 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, return error; } -/** - * gfs2_meta_sync - Sync all buffers associated with a glock - * @gl: The glock - * - */ - -void gfs2_meta_sync(struct gfs2_glock *gl) -{ - struct address_space *mapping = gfs2_glock2aspace(gl); - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - int error; - - if (mapping == NULL) - mapping = &sdp->sd_aspace; - - filemap_fdatawrite(mapping); - error = filemap_fdatawait(mapping); - - if (error) - gfs2_io_error(gl->gl_name.ln_sbd); -} - static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (error) { - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); return; } if (pass != 1) return; - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); @@ -1060,14 +1039,14 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (error) { - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); return; } if (pass != 1) return; /* data sync? */ - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 4a3d8aecdf82..fbdbb08dcec6 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,8 +27,6 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); -extern void gfs2_meta_sync(struct gfs2_glock *gl); - static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { unsigned int limit; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7a7e3c10a9a9..61fce59cb4d3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -633,8 +633,10 @@ static int init_statfs(struct gfs2_sbd *sdp) if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail; + goto out; } + if (sdp->sd_args.ar_spectator) + goto out; pn = gfs2_lookup_simple(master, "per_node"); if (IS_ERR(pn)) { @@ -682,15 +684,17 @@ free_local: iput(pn); put_statfs: iput(sdp->sd_statfs_inode); -fail: +out: return error; } /* Uninitialize and free up memory used by the list of statfs inodes */ static void uninit_statfs(struct gfs2_sbd *sdp) { - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); - free_local_statfs_inodes(sdp); + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + free_local_statfs_inodes(sdp); + } iput(sdp->sd_statfs_inode); } @@ -704,7 +708,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (undo) { jindex = 0; - goto fail_jinode_gh; + goto fail_statfs; } sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b5cbe21efdfb..c26c68ebd29d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -349,7 +349,7 @@ static int update_statfs_inode(struct gfs2_jdesc *jd, mark_buffer_dirty(bh); brelse(bh); - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); out: return error; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ee491bb9c1cc..f7addc6197ed 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -719,9 +719,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) } gfs2_free_clones(rgd); + return_all_reservations(rgd); kfree(rgd->rd_bits); rgd->rd_bits = NULL; - return_all_reservations(rgd); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } @@ -1370,6 +1370,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + if (!blk_queue_discard(q)) return -EOPNOTSUPP; @@ -2526,13 +2529,13 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) rbm.rgd = rgd; error = gfs2_rbm_from_block(&rbm, no_addr); - if (WARN_ON_ONCE(error)) - goto fail; - - if (gfs2_testbit(&rbm, false) != type) - error = -ESTALE; + if (!WARN_ON_ONCE(error)) { + if (gfs2_testbit(&rbm, false) != type) + error = -ESTALE; + } gfs2_glock_dq_uninit(&rgd_gh); + fail: return error; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b285192bd6b3..b3d951ab8068 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -738,6 +738,7 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); + truncate_inode_pages_final(&sdp->sd_aspace); gfs2_delete_debugfs_file(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index dcc2aab1b2c4..4ba45caf5939 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -60,7 +60,7 @@ struct hfs_bnode { wait_queue_head_t lock_wq; atomic_t refcnt; unsigned int page_offset; - struct page *page[0]; + struct page *page[]; }; #define HFS_BNODE_ERROR 0 diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 3b03fff68543..a92de5199ec3 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -117,7 +117,7 @@ struct hfs_bnode { wait_queue_head_t lock_wq; atomic_t refcnt; unsigned int page_offset; - struct page *page[0]; + struct page *page[]; }; #define HFS_BNODE_LOCK 0 diff --git a/fs/io-wq.c b/fs/io-wq.c index 02894df7656d..b53c055bea6a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -482,6 +482,10 @@ static void io_impersonate_work(struct io_worker *worker, current->files = work->identity->files; current->nsproxy = work->identity->nsproxy; task_unlock(current); + if (!work->identity->files) { + /* failed grabbing files, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; + } } if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) current->fs = work->identity->fs; diff --git a/fs/io_uring.c b/fs/io_uring.c index b42dfa0243bf..c77584de68d7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -995,20 +995,33 @@ static void io_sq_thread_drop_mm(void) if (mm) { kthread_unuse_mm(mm); mmput(mm); + current->mm = NULL; } } static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { - if (!current->mm) { - if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || - !ctx->sqo_task->mm || - !mmget_not_zero(ctx->sqo_task->mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_task->mm); + struct mm_struct *mm; + + if (current->mm) + return 0; + + /* Should never happen */ + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL))) + return -EFAULT; + + task_lock(ctx->sqo_task); + mm = ctx->sqo_task->mm; + if (unlikely(!mm || !mmget_not_zero(mm))) + mm = NULL; + task_unlock(ctx->sqo_task); + + if (mm) { + kthread_use_mm(mm); + return 0; } - return 0; + return -EFAULT; } static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, @@ -1274,9 +1287,12 @@ static bool io_identity_cow(struct io_kiocb *req) /* add one for this request */ refcount_inc(&id->count); - /* drop old identity, assign new one. one ref for req, one for tctx */ - if (req->work.identity != tctx->identity && - refcount_sub_and_test(2, &req->work.identity->count)) + /* drop tctx and req identity references, if needed */ + if (tctx->identity != &tctx->__identity && + refcount_dec_and_test(&tctx->identity->count)) + kfree(tctx->identity); + if (req->work.identity != &tctx->__identity && + refcount_dec_and_test(&req->work.identity->count)) kfree(req->work.identity); req->work.identity = id; @@ -1365,6 +1381,9 @@ static void io_prep_async_work(struct io_kiocb *req) io_req_init_async(req); id = req->work.identity; + if (req->flags & REQ_F_FORCE_ASYNC) + req->work.flags |= IO_WQ_WORK_CONCURRENT; + if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); @@ -1574,14 +1593,29 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } -static inline bool io_match_files(struct io_kiocb *req, - struct files_struct *files) +static inline bool __io_match_files(struct io_kiocb *req, + struct files_struct *files) { + return ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES)) && + req->work.identity->files == files; +} + +static bool io_match_files(struct io_kiocb *req, + struct files_struct *files) +{ + struct io_kiocb *link; + if (!files) return true; - if ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_FILES)) - return req->work.identity->files == files; + if (__io_match_files(req, files)) + return true; + if (req->flags & REQ_F_LINK_HEAD) { + list_for_each_entry(link, &req->link_list, link_list) { + if (__io_match_files(link, files)) + return true; + } + } return false; } @@ -1665,7 +1699,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + } else if (ctx->cq_overflow_flushed || + atomic_read(&req->task->io_uring->in_idle)) { /* * If we're in ring overflow flush mode, or in task cancel mode, * then we cannot store the request for later flushing, we need @@ -1835,7 +1870,7 @@ static void __io_free_req(struct io_kiocb *req) io_dismantle_req(req); percpu_counter_dec(&tctx->inflight); - if (tctx->in_idle) + if (atomic_read(&tctx->in_idle)) wake_up(&tctx->wait); put_task_struct(req->task); @@ -1846,59 +1881,39 @@ static void __io_free_req(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } -static bool io_link_cancel_timeout(struct io_kiocb *req) +static void io_kill_linked_timeout(struct io_kiocb *req) { - struct io_timeout_data *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = hrtimer_try_to_cancel(&io->timer); - if (ret != -1) { - io_cqring_fill_event(req, -ECANCELED); - io_commit_cqring(ctx); - req->flags &= ~REQ_F_LINK_HEAD; - io_put_req_deferred(req, 1); - return true; - } - - return false; -} - -static bool __io_kill_linked_timeout(struct io_kiocb *req) -{ struct io_kiocb *link; - bool wake_ev; + bool cancelled = false; + unsigned long flags; - if (list_empty(&req->link_list)) - return false; - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - if (link->opcode != IORING_OP_LINK_TIMEOUT) - return false; + spin_lock_irqsave(&ctx->completion_lock, flags); + link = list_first_entry_or_null(&req->link_list, struct io_kiocb, + link_list); /* * Can happen if a linked timeout fired and link had been like * req -> link t-out -> link t-out [-> ...] */ - if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE)) - return false; + if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { + struct io_timeout_data *io = link->async_data; + int ret; - list_del_init(&link->link_list); - wake_ev = io_link_cancel_timeout(link); + list_del_init(&link->link_list); + ret = hrtimer_try_to_cancel(&io->timer); + if (ret != -1) { + io_cqring_fill_event(link, -ECANCELED); + io_commit_cqring(ctx); + cancelled = true; + } + } req->flags &= ~REQ_F_LINK_TIMEOUT; - return wake_ev; -} - -static void io_kill_linked_timeout(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - bool wake_ev; - - spin_lock_irqsave(&ctx->completion_lock, flags); - wake_ev = __io_kill_linked_timeout(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); - if (wake_ev) + if (cancelled) { io_cqring_ev_posted(ctx); + io_put_req(link); + } } static struct io_kiocb *io_req_link_next(struct io_kiocb *req) @@ -4977,8 +4992,10 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, /* make sure double remove sees this as being gone */ wait->private = NULL; spin_unlock(&poll->head->lock); - if (!done) - __io_async_wake(req, poll, mask, io_poll_task_func); + if (!done) { + /* use wait func handler, so it matches the rq type */ + poll->wait.func(&poll->wait, mode, sync, key); + } } refcount_dec(&req->refs); return 1; @@ -6180,7 +6197,6 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) { struct io_kiocb *linked_timeout; - struct io_kiocb *nxt; const struct cred *old_creds = NULL; int ret; @@ -6206,7 +6222,6 @@ again: */ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { if (!io_arm_poll_handler(req)) { -punt: /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. @@ -6216,33 +6231,25 @@ punt: if (linked_timeout) io_queue_linked_timeout(linked_timeout); - goto exit; - } + } else if (likely(!ret)) { + /* drop submission reference */ + req = io_put_req_find_next(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); - if (unlikely(ret)) { + if (req) { + if (!(req->flags & REQ_F_FORCE_ASYNC)) + goto again; + io_queue_async_work(req); + } + } else { /* un-prep timeout, so it'll be killed as any other linked */ req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); io_put_req(req); io_req_complete(req, ret); - goto exit; } - /* drop submission reference */ - nxt = io_put_req_find_next(req); - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - - if (nxt) { - req = nxt; - - if (req->flags & REQ_F_FORCE_ASYNC) { - linked_timeout = NULL; - goto punt; - } - goto again; - } -exit: if (old_creds) revert_creds(old_creds); } @@ -6266,13 +6273,6 @@ fail_req: if (unlikely(ret)) goto fail_req; } - - /* - * Never try inline submit of IOSQE_ASYNC is set, go straight - * to async execution. - */ - io_req_init_async(req); - req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { if (sqe) { @@ -7727,7 +7727,8 @@ static int io_uring_alloc_task_context(struct task_struct *task) xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; - tctx->in_idle = 0; + atomic_set(&tctx->in_idle, 0); + tctx->sqpoll = false; io_init_identity(&tctx->__identity); tctx->identity = &tctx->__identity; task->io_uring = tctx; @@ -8420,22 +8421,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) return false; } -static bool io_match_link_files(struct io_kiocb *req, - struct files_struct *files) -{ - struct io_kiocb *link; - - if (io_match_files(req, files)) - return true; - if (req->flags & REQ_F_LINK_HEAD) { - list_for_each_entry(link, &req->link_list, link_list) { - if (io_match_files(link, files)) - return true; - } - } - return false; -} - /* * We're looking to cancel 'req' because it's holding on to our files, but * 'req' could be a link to another request. See if it is, and cancel that @@ -8485,7 +8470,21 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx, static bool io_cancel_link_cb(struct io_wq_work *work, void *data) { - return io_match_link(container_of(work, struct io_kiocb, work), data); + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + bool ret; + + if (req->flags & REQ_F_LINK_TIMEOUT) { + unsigned long flags; + struct io_ring_ctx *ctx = req->ctx; + + /* protect against races with linked timeouts */ + spin_lock_irqsave(&ctx->completion_lock, flags); + ret = io_match_link(req, data); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + ret = io_match_link(req, data); + } + return ret; } static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) @@ -8511,6 +8510,7 @@ static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) } static void io_cancel_defer_files(struct io_ring_ctx *ctx, + struct task_struct *task, struct files_struct *files) { struct io_defer_entry *de = NULL; @@ -8518,7 +8518,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_link_files(de->req, files)) { + if (io_task_match(de->req, task) && + io_match_files(de->req, files)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -8544,7 +8545,6 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, if (list_empty_careful(&ctx->inflight_list)) return false; - io_cancel_defer_files(ctx, files); /* cancel all at once, should be faster than doing it one by one*/ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); @@ -8630,8 +8630,16 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, { struct task_struct *task = current; - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { task = ctx->sq_data->thread; + atomic_inc(&task->io_uring->in_idle); + io_sq_thread_park(ctx->sq_data); + } + + if (files) + io_cancel_defer_files(ctx, NULL, files); + else + io_cancel_defer_files(ctx, task, NULL); io_cqring_overflow_flush(ctx, true, task, files); @@ -8639,12 +8647,23 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_run_task_work(); cond_resched(); } + + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + atomic_dec(&task->io_uring->in_idle); + /* + * If the files that are going away are the ones in the thread + * identity, clear them out. + */ + if (task->io_uring->identity->files == files) + task->io_uring->identity->files = NULL; + io_sq_thread_unpark(ctx->sq_data); + } } /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static int io_uring_add_task_file(struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; @@ -8666,6 +8685,14 @@ static int io_uring_add_task_file(struct file *file) tctx->last = file; } + /* + * This is race safe in that the task itself is doing this, hence it + * cannot be going through the exit/cancel paths at the same time. + * This cannot be modified while exit/cancel is running. + */ + if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) + tctx->sqpoll = true; + return 0; } @@ -8707,7 +8734,7 @@ void __io_uring_files_cancel(struct files_struct *files) unsigned long index; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); xa_for_each(&tctx->xa, index, file) { struct io_ring_ctx *ctx = file->private_data; @@ -8716,6 +8743,35 @@ void __io_uring_files_cancel(struct files_struct *files) if (files) io_uring_del_task_file(file); } + + atomic_dec(&tctx->in_idle); +} + +static s64 tctx_inflight(struct io_uring_task *tctx) +{ + unsigned long index; + struct file *file; + s64 inflight; + + inflight = percpu_counter_sum(&tctx->inflight); + if (!tctx->sqpoll) + return inflight; + + /* + * If we have SQPOLL rings, then we need to iterate and find them, and + * add the pending count for those. + */ + xa_for_each(&tctx->xa, index, file) { + struct io_ring_ctx *ctx = file->private_data; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_uring_task *__tctx = ctx->sqo_task->io_uring; + + inflight += percpu_counter_sum(&__tctx->inflight); + } + } + + return inflight; } /* @@ -8729,11 +8785,11 @@ void __io_uring_task_cancel(void) s64 inflight; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = percpu_counter_sum(&tctx->inflight); + inflight = tctx_inflight(tctx); if (!inflight) break; __io_uring_files_cancel(NULL); @@ -8744,13 +8800,13 @@ void __io_uring_task_cancel(void) * If we've seen completions, retry. This avoids a race where * a completion comes in before we did prepare_to_wait(). */ - if (inflight != percpu_counter_sum(&tctx->inflight)) + if (inflight != tctx_inflight(tctx)) continue; schedule(); } while (1); finish_wait(&tctx->wait, &wait); - tctx->in_idle = false; + atomic_dec(&tctx->in_idle); } static int io_uring_flush(struct file *file, void *data) @@ -8895,7 +8951,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(f.file); + ret = io_uring_add_task_file(ctx, f.file); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -8932,7 +8988,8 @@ out_fput: #ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { - const struct cred *cred = p; + struct io_identity *iod = p; + const struct cred *cred = iod->creds; struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; @@ -9124,7 +9181,7 @@ err_fd: #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif - if (unlikely(io_uring_add_task_file(file))) { + if (unlikely(io_uring_add_task_file(ctx, file))) { file = ERR_PTR(-ENOMEM); goto err_fd; } @@ -9169,6 +9226,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ + p->cq_entries = roundup_pow_of_two(p->cq_entries); if (p->cq_entries < p->sq_entries) return -EINVAL; if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { @@ -9176,7 +9234,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, return -EINVAL; p->cq_entries = IORING_MAX_CQ_ENTRIES; } - p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; } diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index 1558cf22ef8a..ee9660e9671c 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -22,7 +22,7 @@ struct SU_ER_s { __u8 len_des; __u8 len_src; __u8 ext_ver; - __u8 data[0]; + __u8 data[]; } __attribute__ ((packed)); struct RR_RR_s { @@ -44,7 +44,7 @@ struct RR_PN_s { struct SL_component { __u8 flags; __u8 len; - __u8 text[0]; + __u8 text[]; } __attribute__ ((packed)); struct RR_SL_s { @@ -54,7 +54,7 @@ struct RR_SL_s { struct RR_NM_s { __u8 flags; - char name[0]; + char name[]; } __attribute__ ((packed)); struct RR_CL_s { @@ -71,7 +71,7 @@ struct stamp { struct RR_TF_s { __u8 flags; - struct stamp times[0]; /* Variable number of these beasts */ + struct stamp times[]; /* Variable number of these beasts */ } __attribute__ ((packed)); /* Linux-specific extension for transparent decompression */ diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 263f02ad8ebf..472932b9e6bc 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -106,6 +106,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) * for a checkpoint to free up some space in the log. */ void __jbd2_log_wait_for_space(journal_t *journal) +__acquires(&journal->j_state_lock) +__releases(&journal->j_state_lock) { int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index fa688e163a80..b121d7d434c6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -450,6 +450,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) schedule(); write_lock(&journal->j_state_lock); finish_wait(&journal->j_fc_wait, &wait); + /* + * TODO: by blocking fast commits here, we are increasing + * fsync() latency slightly. Strictly speaking, we don't need + * to block fast commits until the transaction enters T_FLUSH + * state. So an optimization is possible where we block new fast + * commits here and wait for existing ones to complete + * just before we enter T_FLUSH. That way, the existing fast + * commits and this full commit can proceed parallely. + */ } write_unlock(&journal->j_state_lock); @@ -801,7 +810,7 @@ start_journal_io: if (first_block < journal->j_tail) freed += journal->j_last - journal->j_first; /* Update tail only if we free significant amount of space */ - if (freed < journal->j_maxlen / 4) + if (freed < jbd2_journal_get_max_txn_bufs(journal)) update_tail = 0; } J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c7c42bd530f..0c3d5e3b24b2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -727,6 +727,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) { + if (unlikely(is_journal_aborted(journal))) + return -EIO; /* * Fast commits only allowed if at least one full commit has * been processed. @@ -734,10 +736,12 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) if (!journal->j_stats.ts_tid) return -EINVAL; - if (tid <= journal->j_commit_sequence) + write_lock(&journal->j_state_lock); + if (tid <= journal->j_commit_sequence) { + write_unlock(&journal->j_state_lock); return -EALREADY; + } - write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING || (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) { DEFINE_WAIT(wait); @@ -777,13 +781,19 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) int jbd2_fc_end_commit(journal_t *journal) { - return __jbd2_fc_end_commit(journal, 0, 0); + return __jbd2_fc_end_commit(journal, 0, false); } EXPORT_SYMBOL(jbd2_fc_end_commit); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid) +int jbd2_fc_end_commit_fallback(journal_t *journal) { - return __jbd2_fc_end_commit(journal, tid, 1); + tid_t tid; + + read_lock(&journal->j_state_lock); + tid = journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0; + read_unlock(&journal->j_state_lock); + return __jbd2_fc_end_commit(journal, tid, true); } EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); @@ -865,7 +875,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) int fc_off; *bh_out = NULL; - write_lock(&journal->j_state_lock); if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { fc_off = journal->j_fc_off; @@ -874,7 +883,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) } else { ret = -EINVAL; } - write_unlock(&journal->j_state_lock); if (ret) return ret; @@ -887,11 +895,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) if (!bh) return -ENOMEM; - lock_buffer(bh); - clear_buffer_uptodate(bh); - set_buffer_dirty(bh); - unlock_buffer(bh); journal->j_fc_wbuf[fc_off] = bh; *bh_out = bh; @@ -909,9 +913,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before @@ -939,9 +941,7 @@ int jbd2_fc_release_bufs(journal_t *journal) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before @@ -1348,23 +1348,16 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_dev = bdev; journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; - journal->j_maxlen = len; + journal->j_total_len = len; /* We need enough buffers to write out full descriptor block. */ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; + journal->j_fc_wbuf = NULL; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); if (!journal->j_wbuf) goto err_cleanup; - if (journal->j_fc_wbufsize > 0) { - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), - GFP_KERNEL); - if (!journal->j_fc_wbuf) - goto err_cleanup; - } - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); if (!bh) { pr_err("%s: Cannot get buffer for journal superblock\n", @@ -1378,23 +1371,11 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: kfree(journal->j_wbuf); - kfree(journal->j_fc_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); return NULL; } -int jbd2_fc_init(journal_t *journal, int num_fc_blks) -{ - journal->j_fc_wbufsize = num_fc_blks; - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), GFP_KERNEL); - if (!journal->j_fc_wbuf) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL(jbd2_fc_init); - /* jbd2_journal_init_dev and jbd2_journal_init_inode: * * Create a journal structure assigned some fixed set of disk blocks to @@ -1512,16 +1493,7 @@ static int journal_reset(journal_t *journal) } journal->j_first = first; - - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { - journal->j_fc_last = last; - journal->j_last = last - journal->j_fc_wbufsize; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } else { - journal->j_last = last; - } + journal->j_last = last; journal->j_head = journal->j_first; journal->j_tail = journal->j_first; @@ -1531,7 +1503,14 @@ static int journal_reset(journal_t *journal) journal->j_commit_sequence = journal->j_transaction_sequence - 1; journal->j_commit_request = journal->j_commit_sequence; - journal->j_max_transaction_buffers = journal->j_maxlen / 4; + journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); + + /* + * Now that journal recovery is done, turn fast commits off here. This + * way, if fast commit was enabled before the crash but if now FS has + * disabled it, we don't enable fast commits. + */ + jbd2_clear_feature_fast_commit(journal); /* * As a special case, if the on-disk copy is already marked as needing @@ -1792,15 +1771,15 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) - journal->j_maxlen = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + be32_to_cpu(sb->s_first) >= journal->j_total_len) { printk(KERN_WARNING "JBD2: Invalid start block of journal: %u\n", be32_to_cpu(sb->s_first)); @@ -1872,6 +1851,7 @@ static int load_superblock(journal_t *journal) { int err; journal_superblock_t *sb; + int num_fc_blocks; err = journal_get_superblock(journal); if (err) @@ -1883,15 +1863,17 @@ static int load_superblock(journal_t *journal) journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { + if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize; + num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); + if (!num_fc_blocks) + num_fc_blocks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) + journal->j_last = journal->j_fc_last - num_fc_blocks; journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; - } else { - journal->j_last = be32_to_cpu(sb->s_maxlen); } return 0; @@ -1954,9 +1936,6 @@ int jbd2_journal_load(journal_t *journal) */ journal->j_flags &= ~JBD2_ABORT; - if (journal->j_fc_wbufsize > 0) - jbd2_journal_set_features(journal, 0, 0, - JBD2_FEATURE_INCOMPAT_FAST_COMMIT); /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ @@ -2040,8 +2019,7 @@ int jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) crypto_free_shash(journal->j_chksum_driver); - if (journal->j_fc_wbufsize > 0) - kfree(journal->j_fc_wbuf); + kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); @@ -2116,6 +2094,37 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp return 0; } +static int +jbd2_journal_initialize_fast_commit(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long num_fc_blks; + + num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); + if (num_fc_blks == 0) + num_fc_blks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) + return -ENOSPC; + + /* Are we called twice? */ + WARN_ON(journal->j_fc_wbuf != NULL); + journal->j_fc_wbuf = kmalloc_array(num_fc_blks, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!journal->j_fc_wbuf) + return -ENOMEM; + + journal->j_fc_wbufsize = num_fc_blks; + journal->j_fc_last = journal->j_last; + journal->j_last = journal->j_fc_last - num_fc_blks; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + journal->j_free = journal->j_last - journal->j_first; + journal->j_max_transaction_buffers = + jbd2_journal_get_max_txn_bufs(journal); + + return 0; +} + /** * int jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. @@ -2159,6 +2168,13 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb = journal->j_superblock; + if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { + if (jbd2_journal_initialize_fast_commit(journal)) { + pr_err("JBD2: Cannot enable fast commits.\n"); + return 0; + } + } + /* Load the checksum driver if necessary */ if ((journal->j_chksum_driver == NULL) && INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index eb2606133cd8..dc0694fcfcd1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -74,8 +74,8 @@ static int do_readahead(journal_t *journal, unsigned int start) /* Do up to 128K of readahead */ max = start + (128 * 1024 / journal->j_blocksize); - if (max > journal->j_maxlen) - max = journal->j_maxlen; + if (max > journal->j_total_len) + max = journal->j_total_len; /* Do the readahead itself. We'll submit MAXBUF buffer_heads at * a time to the block device IO layer. */ @@ -134,7 +134,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, *bhp = NULL; - if (offset >= journal->j_maxlen) { + if (offset >= journal->j_total_len) { printk(KERN_ERR "JBD2: corrupted journal superblock\n"); return -EFSCORRUPTED; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 43985738aa86..d54f04674e8e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -195,8 +195,10 @@ static void wait_transaction_switching(journal_t *journal) DEFINE_WAIT(wait); if (WARN_ON(!journal->j_running_transaction || - journal->j_running_transaction->t_state != T_SWITCH)) + journal->j_running_transaction->t_state != T_SWITCH)) { + read_unlock(&journal->j_state_lock); return; + } prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); read_unlock(&journal->j_state_lock); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index cb52db9a0cfb..4e011adaf967 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -955,7 +955,6 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { - struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", @@ -967,15 +966,15 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) case SEEK_SET: if (offset < 0) return -EINVAL; - inode_lock(inode); + spin_lock(&filp->f_lock); break; case SEEK_CUR: if (offset == 0) return filp->f_pos; - inode_lock(inode); + spin_lock(&filp->f_lock); offset += filp->f_pos; if (offset < 0) { - inode_unlock(inode); + spin_unlock(&filp->f_lock); return -EINVAL; } } @@ -987,7 +986,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } - inode_unlock(inode); + spin_unlock(&filp->f_lock); return offset; } @@ -998,13 +997,9 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *inode = file_inode(filp); - dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - inode_lock(inode); - nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - inode_unlock(inode); + nfs_inc_stats(file_inode(filp), NFSIOS_VFSFSYNC); return 0; } diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index b51424ff8159..6c2ce799150f 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -1047,8 +1047,10 @@ out4: void nfs4_xattr_cache_exit(void) { + unregister_shrinker(&nfs4_xattr_large_entry_shrinker); unregister_shrinker(&nfs4_xattr_entry_shrinker); unregister_shrinker(&nfs4_xattr_cache_shrinker); + list_lru_destroy(&nfs4_xattr_large_entry_lru); list_lru_destroy(&nfs4_xattr_entry_lru); list_lru_destroy(&nfs4_xattr_cache_lru); kmem_cache_destroy(nfs4_xattr_cache_cachep); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0dc31ad2362e..6e060a88f98c 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -196,7 +196,7 @@ 1 + nfs4_xattr_name_maxsz + 1) #define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) #define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1) -#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1) +#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + 1) #define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \ nfs4_xattr_name_maxsz) #define decode_removexattr_maxsz (op_decode_hdr_maxsz + \ @@ -531,7 +531,7 @@ static void encode_listxattrs(struct xdr_stream *xdr, { __be32 *p; - encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz + 1, hdr); + encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz, hdr); p = reserve_space(xdr, 12); if (unlikely(!p)) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 8d3278805602..fa148308822c 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -88,7 +88,13 @@ #define NFS_ROOT "/tftpboot/%s" /* Default NFSROOT mount options. */ +#if defined(CONFIG_NFS_V2) #define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096" +#elif defined(CONFIG_NFS_V3) +#define NFS_DEF_OPTIONS "vers=3,tcp,rsize=4096,wsize=4096" +#else +#define NFS_DEF_OPTIONS "vers=4,tcp,rsize=4096,wsize=4096" +#endif /* Parameters passed from the kernel command line */ static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = ""; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 14468613d150..a633044b0dc1 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -316,10 +316,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - if (argp->ftype == 0 || argp->ftype >= NF3BAD) { - resp->status = nfserr_inval; - goto out; - } if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { rdev = MKDEV(argp->major, argp->minor); if (MAJOR(rdev) != argp->major || @@ -328,7 +324,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) goto out; } } else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) { - resp->status = nfserr_inval; + resp->status = nfserr_badtype; goto out; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9c23b6acf234..2277f83da250 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1114,6 +1114,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_pathconfres *resp = rqstp->rq_resp; + *p++ = resp->status; *p++ = xdr_zero; /* no post_op_attr */ if (resp->status == 0) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ad2fa1a8e7ad..e83b21778816 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1299,7 +1299,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, struct nfsd_file *dst) { nfs42_ssc_close(src->nf_file); - nfsd_file_put(src); + /* 'src' is freed by nfsd4_do_async_copy */ nfsd_file_put(dst); mntput(ss_mnt); } @@ -1486,6 +1486,7 @@ do_callback: cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); if (!cb_copy) goto out; + refcount_set(&cb_copy->refcount, 1); memcpy(&cb_copy->cp_res, ©->cp_res, sizeof(copy->cp_res)); cb_copy->cp_clp = copy->cp_clp; cb_copy->nfserr = copy->nfserr; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b9a9d69dde7e..db52e843002a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -877,7 +877,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) goto done; } - trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); + trace_ocfs2_journal_init_maxlen(j_journal->j_total_len); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & OCFS2_JOURNAL_DIRTY_FL); diff --git a/fs/proc/base.c b/fs/proc/base.c index 0f707003dda5..b362523a9829 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1049,6 +1049,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX; put_task_struct(task); + if (oom_adj > OOM_ADJUST_MAX) + oom_adj = OOM_ADJUST_MAX; len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index d0989a443c77..419760fd77bd 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -19,7 +19,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) static const struct proc_ops cpuinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = seq_release, }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2f9fa179194d..b84663252add 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -590,7 +590,7 @@ static int proc_seq_release(struct inode *inode, struct file *file) static const struct proc_ops proc_seq_ops = { /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = proc_seq_release, }; @@ -621,7 +621,7 @@ static int proc_single_open(struct inode *inode, struct file *file) static const struct proc_ops proc_single_ops = { /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 58c075e2a452..bde6b6f69852 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -597,6 +597,7 @@ static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, + .splice_read = generic_file_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, @@ -622,6 +623,7 @@ static const struct file_operations proc_reg_file_ops_compat = { static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, + .splice_read = generic_file_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 46b3293015fe..4695b6de3151 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -226,7 +226,7 @@ static int stat_open(struct inode *inode, struct file *file) static const struct proc_ops stat_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; diff --git a/fs/select.c b/fs/select.c index 7aef49552d4c..ebfebdfe5c69 100644 --- a/fs/select.c +++ b/fs/select.c @@ -97,7 +97,7 @@ u64 select_estimate_accuracy(struct timespec64 *tv) struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; - struct poll_table_entry entries[0]; + struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ @@ -836,7 +836,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; int len; - struct pollfd entries[0]; + struct pollfd entries[]; }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) diff --git a/fs/seq_file.c b/fs/seq_file.c index 31219c1db17d..3b20e21604e7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/printk.h> #include <linux/string_helpers.h> +#include <linux/uio.h> #include <linux/uaccess.h> #include <asm/page.h> @@ -146,7 +147,28 @@ Eoverflow: */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct seq_file *m = file->private_data; + struct iovec iov = { .iov_base = buf, .iov_len = size}; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + iov_iter_init(&iter, READ, &iov, 1, size); + + kiocb.ki_pos = *ppos; + ret = seq_read_iter(&kiocb, &iter); + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(seq_read); + +/* + * Ready-made ->f_op->read_iter() + */ +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct seq_file *m = iocb->ki_filp->private_data; + size_t size = iov_iter_count(iter); size_t copied = 0; size_t n; void *p; @@ -158,14 +180,14 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ - if (*ppos == 0) { + if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } - /* Don't assume *ppos is where we left it */ - if (unlikely(*ppos != m->read_pos)) { - while ((err = traverse(m, *ppos)) == -EAGAIN) + /* Don't assume ki_pos is where we left it */ + if (unlikely(iocb->ki_pos != m->read_pos)) { + while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ @@ -174,7 +196,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->count = 0; goto Done; } else { - m->read_pos = *ppos; + m->read_pos = iocb->ki_pos; } } @@ -187,13 +209,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* if not empty - flush it first */ if (m->count) { n = min(m->count, size); - err = copy_to_user(buf, m->buf + m->from, n); - if (err) + if (copy_to_iter(m->buf + m->from, n, iter) != n) goto Efault; m->count -= n; m->from += n; size -= n; - buf += n; copied += n; if (!size) goto Done; @@ -254,8 +274,7 @@ Fill: } m->op->stop(m, p); n = min(m->count, size); - err = copy_to_user(buf, m->buf, n); - if (err) + if (copy_to_iter(m->buf, n, iter) != n) goto Efault; copied += n; m->count -= n; @@ -264,7 +283,7 @@ Done: if (!copied) copied = err; else { - *ppos += copied; + iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); @@ -276,7 +295,7 @@ Efault: err = -EFAULT; goto Done; } -EXPORT_SYMBOL(seq_read); +EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. |