diff options
Diffstat (limited to 'fs')
331 files changed, 5620 insertions, 3613 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index a547307c1ae8..2685a4d0d353 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -235,6 +235,7 @@ config ARCH_SUPPORTS_HUGETLBFS config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN + depends on (SYSFS || SYSCTL) help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index f14478643b91..93539aac0e5b 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on ARM || ((M68K || SUPERH) && !MMU) + depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load @@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx) local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { - int old, avail = atomic_read(&ctx->reqs_available); + int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; - - old = avail; - avail = atomic_cmpxchg(&ctx->reqs_available, - avail, avail - ctx->req_batch); - } while (avail != old); + } while (!atomic_try_cmpxchg(&ctx->reqs_available, + &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 12b8fdcc445b..9d1cde8066cf 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -147,7 +147,7 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, } static int bad_inode_tmpfile(struct user_namespace *mnt_userns, - struct inode *inode, struct dentry *dentry, + struct inode *inode, struct file *file, umode_t mode) { return -EIO; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 63c7ebb0da89..6a11025e5850 100644 --- a/fs/binfmt_elf.c +++ 
b/fs/binfmt_elf.c @@ -911,7 +911,7 @@ static int load_elf_binary(struct linux_binprm *bprm) interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL); if (!interp_elf_ex) { retval = -ENOMEM; - goto out_free_ph; + goto out_free_file; } /* Get the exec headers */ @@ -1354,6 +1354,7 @@ out: out_free_dentry: kfree(interp_elf_ex); kfree(interp_elf_phdata); +out_free_file: allow_write_access(interpreter); if (interpreter) fput(interpreter); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index dce3a16996b9..18374a6d05bd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -138,6 +138,7 @@ struct share_check { u64 root_objectid; u64 inum; int share_count; + bool have_delayed_delete_refs; }; static inline int extent_is_shared(struct share_check *sc) @@ -288,8 +289,10 @@ static void prelim_release(struct preftree *preftree) struct prelim_ref *ref, *next_ref; rbtree_postorder_for_each_entry_safe(ref, next_ref, - &preftree->root.rb_root, rbnode) + &preftree->root.rb_root, rbnode) { + free_inode_elem_list(ref->inode_list); free_pref(ref); + } preftree->root = RB_ROOT_CACHED; preftree->count = 0; @@ -647,6 +650,18 @@ unode_aux_to_inode_list(struct ulist_node *node) return (struct extent_inode_elem *)(uintptr_t)node->aux; } +static void free_leaf_list(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(ulist, &uiter))) + free_inode_elem_list(unode_aux_to_inode_list(node)); + + ulist_free(ulist); +} + /* * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not @@ -761,7 +776,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, cond_resched(); } out: - ulist_free(parents); + /* + * We may have inode lists attached to refs in the parents ulist, so we + * must free them before freeing the ulist and its refs. 
+ */ + free_leaf_list(parents); return ret; } @@ -820,16 +839,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; - struct btrfs_key tmp_op_key; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - spin_lock(&head->lock); for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, @@ -855,10 +869,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; + struct btrfs_key *key_ptr = NULL; + + if (head->extent_op && head->extent_op->update_key) { + btrfs_disk_key_to_cpu(&key, &head->extent_op->key); + key_ptr = &key; + } ref = btrfs_delayed_node_to_tree_ref(node); ret = add_indirect_ref(fs_info, preftrees, ref->root, - &tmp_op_key, ref->level + 1, + key_ptr, ref->level + 1, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -884,13 +904,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, key.offset = ref->offset; /* - * Found a inum that doesn't match our known inum, we - * know it's shared. + * If we have a share check context and a reference for + * another inode, we can't exit immediately. This is + * because even if this is a BTRFS_ADD_DELAYED_REF + * reference we may find next a BTRFS_DROP_DELAYED_REF + * which cancels out this ADD reference. 
+ * + * If this is a DROP reference and there was no previous + * ADD reference, then we need to signal that when we + * process references from the extent tree (through + * add_inline_refs() and add_keyed_refs()), we should + * not exit early if we find a reference for another + * inode, because one of the delayed DROP references + * may cancel that reference in the extent tree. */ - if (sc && sc->inum && ref->objectid != sc->inum) { - ret = BACKREF_FOUND_SHARED; - goto out; - } + if (sc && count < 0) + sc->have_delayed_delete_refs = true; ret = add_indirect_ref(fs_info, preftrees, ref->root, &key, 0, node->bytenr, count, sc, @@ -920,7 +949,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } if (!ret) ret = extent_is_shared(sc); -out: + spin_unlock(&head->lock); return ret; } @@ -1023,7 +1052,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1033,6 +1063,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ret = add_indirect_ref(fs_info, preftrees, root, &key, 0, bytenr, count, sc, GFP_NOFS); + break; } default: @@ -1122,7 +1153,8 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1354,6 +1386,12 @@ again: if (ret < 0) goto out; ref->inode_list = eie; + /* + * We transferred the list ownership to the ref, + * so set to NULL to avoid a double free in case + * an error happens after this. 
+ */ + eie = NULL; } ret = ulist_add_merge_ptr(refs, ref->parent, ref->inode_list, @@ -1379,6 +1417,14 @@ again: eie->next = ref->inode_list; } eie = NULL; + /* + * We have transferred the inode list ownership from + * this ref to the ref we added to the 'refs' ulist. + * So set this ref's inode list to NULL to avoid + * use-after-free when our caller uses it or double + * frees in case an error happens before we return. + */ + ref->inode_list = NULL; } cond_resched(); } @@ -1395,24 +1441,6 @@ out: return ret; } -static void free_leaf_list(struct ulist *blocks) -{ - struct ulist_node *node = NULL; - struct extent_inode_elem *eie; - struct ulist_iterator uiter; - - ULIST_ITER_INIT(&uiter); - while ((node = ulist_next(blocks, &uiter))) { - if (!node->aux) - continue; - eie = unode_aux_to_inode_list(node); - free_inode_elem_list(eie); - node->aux = 0; - } - - ulist_free(blocks); -} - /* * Finds all leafs with a reference to the specified combination of bytenr and * offset. key_list_head will point to a list of corresponding keys (caller must @@ -1522,6 +1550,9 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache { struct btrfs_backref_shared_cache_entry *entry; + if (!cache->use_cache) + return false; + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) return false; @@ -1557,6 +1588,19 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache return false; *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. 
+ */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + cache->entries[i].is_shared = true; + cache->entries[i].gen = entry->gen; + } + } return true; } @@ -1573,6 +1617,9 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, struct btrfs_backref_shared_cache_entry *entry; u64 gen; + if (!cache->use_cache) + return; + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) return; @@ -1648,6 +1695,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, + .have_delayed_delete_refs = false, }; int level; @@ -1669,6 +1717,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); + cache->use_cache = true; while (1) { bool is_shared; bool cached; @@ -1698,6 +1747,24 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, extent_gen > btrfs_root_last_snapshot(&root->root_item)) break; + /* + * If our data extent was not directly shared (without multiple + * reference items), than it might have a single reference item + * with a count > 1 for the same offset, which means there are 2 + * (or more) file extent items that point to the data extent - + * this happens when a file extent item needs to be split and + * then one item gets moved to another leaf due to a b+tree leaf + * split when inserting some item. In this case the file extent + * items may be located in different leaves and therefore some + * of the leaves may be referenced through shared subtrees while + * others are not. Since our extent buffer cache only works for + * a single path (by far the most common case and simpler to + * deal with), we can not use it if we have multiple leaves + * (which implies multiple paths). 
+ */ + if (level == -1 && tmp->nnodes > 1) + cache->use_cache = false; + if (level >= 0) store_backref_shared_cache(cache, root, bytenr, level, false); @@ -1713,6 +1780,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, break; } shared.share_count = 0; + shared.have_delayed_delete_refs = false; cond_resched(); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 52ae6957b414..8e69584d538d 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -29,6 +29,7 @@ struct btrfs_backref_shared_cache { * a given data extent should never exceed the maximum b+tree height. */ struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL]; + bool use_cache; }; typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 32c415cfbdfe..deebc8ddbd93 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -774,10 +774,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); out: - /* REVIEW */ if (wait && caching_ctl) ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); - /* wait_event(caching_ctl->wait, space_cache_v1_done(cache)); */ if (caching_ctl) btrfs_put_caching_control(caching_ctl); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 54caa00a2245..f1f051ad3147 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -8,6 +8,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/time.h> @@ -15,6 +16,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> +#include <linux/psi.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> @@ -218,8 +220,7 @@ static noinline void end_compressed_writeback(struct inode *inode, struct btrfs_fs_info *fs_info = 
btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; - struct page *pages[16]; - unsigned long nr_pages = end_index - index + 1; + struct folio_batch fbatch; const int errno = blk_status_to_errno(cb->status); int i; int ret; @@ -227,24 +228,23 @@ static noinline void end_compressed_writeback(struct inode *inode, if (errno) mapping_set_error(inode->i_mapping, errno); - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - nr_pages -= 1; - index += 1; - continue; - } + folio_batch_init(&fbatch); + while (index <= end_index) { + ret = filemap_get_folios(inode->i_mapping, &index, end_index, + &fbatch); + + if (ret == 0) + return; + for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (errno) - SetPageError(pages[i]); - btrfs_page_clamp_clear_writeback(fs_info, pages[i], + folio_set_error(folio); + btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); - put_page(pages[i]); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); } /* the inode may be gone now */ } @@ -511,7 +511,8 @@ static u64 bio_end_offset(struct bio *bio) */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, - struct compressed_bio *cb) + struct compressed_bio *cb, + unsigned long *pflags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; @@ -580,6 +581,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } + if (PageWorkingset(page)) + psi_memstall_enter(pflags); + ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -666,6 +670,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; + /* Initialize to 1 to make skip psi_memstall_leave unless needed */ + unsigned long pflags = 1; 
blk_status_t ret; int ret2; int i; @@ -721,7 +727,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto fail; } - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -801,6 +807,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } } + if (!pflags) + psi_memstall_leave(&pflags); + if (refcount_dec_and_test(&cb->pending_ios)) finish_compressed_bio_read(cb); return; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 727595eee973..f677b49df8ae 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3462,7 +3462,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); extern const struct dentry_operations btrfs_dentry_operations; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a2da9313c694..4b28263c3d32 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -166,11 +166,9 @@ static bool btrfs_supported_super_csum(u16 csum_type) * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. 
*/ -static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, - char *raw_disk_sb) +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb) { - struct btrfs_super_block *disk_sb = - (struct btrfs_super_block *)raw_disk_sb; char result[BTRFS_CSUM_SIZE]; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -181,7 +179,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ - crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) @@ -3479,7 +3477,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). 
*/ - if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { + if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; btrfs_release_disk_super(disk_super); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c67c15d4d20b..9fa923e005a3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -42,6 +42,8 @@ struct extent_buffer *btrfs_find_create_tree_block( void btrfs_clean_tree_block(struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1d4c2397d0d6..fab7eb76e53b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index f32f4113c976..5afb7ca42828 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,7 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation); struct dentry *btrfs_get_parent(struct dentry *child); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 618275af19c4..83cb0378096f 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1641,16 +1641,17 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, int err; u64 failed_start; - while (1) { + err = 
__set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + cached_state, NULL, GFP_NOFS); + while (err == -EEXIST) { + if (failed_start != start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, cached_state); + + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, cached_state, NULL, GFP_NOFS); - if (err == -EEXIST) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else - break; - WARN_ON(start > end); } return err; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd2d36580f1a..2801c991814f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3295,21 +3295,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } /* - * If this is a leaf and there are tree mod log users, we may - * have recorded mod log operations that point to this leaf. - * So we must make sure no one reuses this leaf's extent before - * mod log operations are applied to a node, otherwise after - * rewinding a node using the mod log operations we get an - * inconsistent btree, as the leaf's extent may now be used as - * a node or leaf for another different btree. + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. 
+ * * We are safe from races here because at this point no other * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins, it will not be able to - * find a node pointing to this leaf and record operations that - * point to this leaf. + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. */ - if (btrfs_header_level(buf) == 0 && - test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) must_pin = true; if (must_pin || btrfs_is_zoned(fs_info)) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1eae68fbae21..4dcf22e051ff 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -270,9 +270,8 @@ static int __process_pages_contig(struct address_space *mapping, pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - struct page *pages[16]; + struct folio_batch fbatch; int err = 0; int i; @@ -281,16 +280,17 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) mapping_set_error(mapping, -EIO); - while (nr_pages > 0) { - int found_pages; + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); - found_pages = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (found_pages == 0) { + if (found_folios == 0) { /* * Only if we're going to lock these pages, we can find * nothing at @index. 
@@ -300,23 +300,20 @@ static int __process_pages_contig(struct address_space *mapping, goto out; } - for (i = 0; i < found_pages; i++) { + for (i = 0; i < found_folios; i++) { int process_ret; - + struct folio *folio = fbatch.folios[i]; process_ret = process_one_page(fs_info, mapping, - pages[i], locked_page, page_ops, + &folio->page, locked_page, page_ops, start, end); if (process_ret < 0) { - for (; i < found_pages; i++) - put_page(pages[i]); err = -EAGAIN; + folio_batch_release(&fbatch); goto out; } - put_page(pages[i]); - pages_processed++; + pages_processed += folio_nr_pages(folio); } - nr_pages -= found_pages; - index += found_pages; + folio_batch_release(&fbatch); cond_resched(); } out: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 176b432035ae..d01631d47806 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1598,14 +1598,19 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, write_bytes); else btrfs_check_nocow_unlock(BTRFS_I(inode)); + + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; break; } release_bytes = reserve_bytes; again: ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); - if (ret) + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); break; + } /* * This is going to setup the pages array with the number of @@ -1765,6 +1770,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; ssize_t err; unsigned int ilock_flags = 0; + struct iomap_dio *dio; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1825,11 +1831,22 @@ relock: * So here we disable page faults in the iov_iter and then retry if we * got -EFAULT, faulting in the pages before the retry. */ -again: from->nofault = true; - err = btrfs_dio_rw(iocb, from, written); + dio = btrfs_dio_write(iocb, from, written); from->nofault = false; + /* + * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync + * iocb, and that needs to lock the inode. 
So unlock it before calling + * iomap_dio_complete() to avoid a deadlock. + */ + btrfs_inode_unlock(inode, ilock_flags); + + if (IS_ERR_OR_NULL(dio)) + err = PTR_ERR_OR_ZERO(dio); + else + err = iomap_dio_complete(dio); + /* No increment (+=) because iomap returns a cumulative value. */ if (err > 0) written = err; @@ -1855,12 +1872,10 @@ again: } else { fault_in_iov_iter_readable(from, left); prev_left = left; - goto again; + goto relock; } } - btrfs_inode_unlock(inode, ilock_flags); - /* * If 'err' is -ENOTBLK or we have not written all data, then it means * we must fallback to buffered IO. @@ -4035,7 +4050,7 @@ again: */ pagefault_disable(); to->nofault = true; - ret = btrfs_dio_rw(iocb, to, read); + ret = btrfs_dio_read(iocb, to, read); to->nofault = false; pagefault_enable(); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 45ebef8d3ea8..0e516aefbf51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7980,7 +7980,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, */ status = BLK_STS_RESOURCE; dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); - if (!dip) + if (!dip->csums) goto out_err; status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); @@ -8078,13 +8078,21 @@ static const struct iomap_dio_ops btrfs_dio_ops = { .bio_set = &btrfs_dio_bioset, }; -ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { struct btrfs_dio_data data; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC, - &data, done_before); + IOMAP_DIO_PARTIAL, &data, done_before); +} + +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) +{ + struct btrfs_dio_data data; + + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); } static int btrfs_fiemap(struct inode *inode, 
struct fiemap_extent_info *fieinfo, @@ -10018,7 +10026,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns, } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -10026,7 +10034,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct inode *inode; struct btrfs_new_inode_args new_inode_args = { .dir = dir, - .dentry = dentry, + .dentry = file->f_path.dentry, .orphan = true, }; unsigned int trans_num_items; @@ -10063,7 +10071,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, set_nlink(inode, 1); if (!ret) { - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); mark_inode_dirty(inode); } @@ -10075,7 +10083,7 @@ out_new_inode_args: out_inode: if (ret) iput(inode); - return ret; + return finish_open_simple(file, ret); } void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f6395e8288d6..82c8e991300e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1632,10 +1632,8 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio) int ret; ret = alloc_rbio_parity_pages(rbio); - if (ret) { - __free_raid_bio(rbio); + if (ret) return ret; - } ret = lock_stripe_add(rbio); if (ret == 0) @@ -1823,8 +1821,10 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) */ if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); - if (ret) + if (ret) { + __free_raid_bio(rbio); goto fail; + } return; } @@ -1838,8 +1838,10 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) list_add_tail(&rbio->plug_list, &plug->rbio_list); } else { ret = __raid56_parity_write(rbio); - if (ret) + if (ret) { + __free_raid_bio(rbio); goto fail; + } } return; @@ -2742,8 +2744,10 @@ 
raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { - BUG(); - kfree(rbio); + btrfs_warn_rl(fs_info, + "can not determine the failed stripe number for full stripe %llu", + bioc->raid_map[0]); + __free_raid_bio(rbio); return NULL; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ef4167072b8..145c84b44fd0 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -348,6 +348,7 @@ static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) switch (sctx->proto) { case 1: return cmd <= BTRFS_SEND_C_MAX_V1; case 2: return cmd <= BTRFS_SEND_C_MAX_V2; + case 3: return cmd <= BTRFS_SEND_C_MAX_V3; default: return false; } } @@ -6469,7 +6470,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; } - if (sctx->cur_inode_needs_verity) { + + if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) + && sctx->cur_inode_needs_verity) { ret = process_verity(sctx); if (ret < 0) goto out; @@ -6665,17 +6668,19 @@ static int changed_inode(struct send_ctx *sctx, /* * First, process the inode as if it was deleted. */ - sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = false; - sctx->cur_inode_deleted = true; - sctx->cur_inode_size = btrfs_inode_size( - sctx->right_path->nodes[0], right_ii); - sctx->cur_inode_mode = btrfs_inode_mode( - sctx->right_path->nodes[0], right_ii); - ret = process_all_refs(sctx, - BTRFS_COMPARE_TREE_DELETED); - if (ret < 0) - goto out; + if (old_nlinks > 0) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + } /* * Now process the inode as if it was new. 
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 0a4537775e0c..f7585cfa7e52 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -10,7 +10,12 @@ #include <linux/types.h> #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" +/* Conditional support for the upcoming protocol version. */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_SEND_STREAM_VERSION 3 +#else #define BTRFS_SEND_STREAM_VERSION 2 +#endif /* * In send stream v1, no command is larger than 64K. In send stream v2, no limit diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 6fc2b77ae5c3..9a176af847d7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -337,7 +337,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * * Even with 0 returned, the page still need extra check to make sure * it's really the correct page, as the caller is using - * find_get_pages_contig(), which can race with page invalidating. + * filemap_get_folios_contig(), which can race with page invalidating. */ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9be4fd2db0f4..5942b9384088 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2555,6 +2555,7 @@ static int check_dev_super(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_super_block *sb; + u16 csum_type; int ret = 0; /* This should be called with fs still frozen. */ @@ -2569,6 +2570,21 @@ static int check_dev_super(struct btrfs_device *dev) if (IS_ERR(sb)) return PTR_ERR(sb); + /* Verify the checksum. 
*/ + csum_type = btrfs_super_csum_type(sb); + if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + btrfs_err(fs_info, "csum type changed, has %u expect %u", + csum_type, btrfs_super_csum_type(fs_info->super_copy)); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_check_super_csum(fs_info, sb)) { + btrfs_err(fs_info, "csum for on-disk super block no longer matches"); + ret = -EUCLEAN; + goto out; + } + /* Btrfs_validate_super() includes fsid check against super->fsid. */ ret = btrfs_validate_super(fs_info, sb, 0); if (ret < 0) diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index f69ec4d2d6eb..350da449db08 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -4,6 +4,7 @@ */ #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/sizes.h> @@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) { int ret; - struct page *pages[16]; + struct folio_batch fbatch; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - unsigned long nr_pages = end_index - index + 1; int i; int count = 0; int loops = 0; - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, nr_pages, - ARRAY_SIZE(pages)), pages); + folio_batch_init(&fbatch); + + while (index <= end_index) { + ret = filemap_get_folios_contig(inode->i_mapping, &index, + end_index, &fbatch); for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (flags & PROCESS_TEST_LOCKED && - !PageLocked(pages[i])) + !folio_test_locked(folio)) count++; - if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) - unlock_page(pages[i]); - put_page(pages[i]); + if (flags & PROCESS_UNLOCK && folio_test_locked(folio)) + folio_unlock(folio); if (flags & PROCESS_RELEASE) - put_page(pages[i]); + folio_put(folio); } - nr_pages -= ret; - index += ret; 
+ folio_batch_release(&fbatch); cond_resched(); loops++; if (loops > 100000) { printk(KERN_ERR - "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n", - start, end, nr_pages, ret); + "stuck in a loop, start %llu, end %llu, ret %d\n", + start, end, ret); break; } } + return count; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index eee1e4459541..63676ea19f29 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -225,20 +225,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root, */ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -250,29 +250,31 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + /* btrfs_qgroup_account_extent() always frees the ulists passed to it. 
*/ + old_roots = NULL; + new_roots = NULL; + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { test_err("qgroup counts didn't match expected values"); return -EINVAL; } - old_roots = NULL; - new_roots = NULL; ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = remove_extent_item(root, nodesize, nodesize); - if (ret) + if (ret) { + ulist_free(old_roots); return -EINVAL; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -322,20 +324,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -355,20 +357,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -394,20 +396,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = 
btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 94ba46d57920..a8d4bc6a1937 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7142,6 +7142,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, u64 devid; u64 type; u8 uuid[BTRFS_UUID_SIZE]; + int index; int num_stripes; int ret; int i; @@ -7149,6 +7150,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); + index = btrfs_bg_flags_to_raid_index(type); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); #if BITS_PER_LONG == 32 @@ -7202,7 +7204,15 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->io_align = btrfs_chunk_io_align(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = type; - map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + /* + * We can't use the sub_stripes value, as for profiles other than + * RAID10, they may have 0 as sub_stripes for filesystems created by + * older mkfs (<v5.4). + * In that case, it can cause divide-by-zero errors later. + * Since currently sub_stripes is fixed for each profile, let's + * use the trusted value instead. 
+ */ + map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 599b9d5af349..f8b668dc8bf8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -395,6 +395,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); */ struct btrfs_bio { unsigned int mirror_num; + struct bvec_iter iter; /* for direct I/O */ u64 file_offset; @@ -403,7 +404,6 @@ struct btrfs_bio { struct btrfs_device *device; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - struct bvec_iter iter; /* End I/O information supplied to btrfs_bio_alloc */ btrfs_bio_end_io_t end_io; diff --git a/fs/buffer.c b/fs/buffer.c index 0a7ba84c1905..d9c6d1fbb6dd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -152,7 +152,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) /* * Default synchronous end-of-IO handler.. Just mark it up-to-date and - * unlock the buffer. This is what ll_rw_block uses too. + * unlock the buffer. */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { @@ -491,8 +491,8 @@ int inode_has_buffers(struct inode *inode) * all already-submitted IO to complete, but does not queue any new * writes to the disk. * - * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as - * you dirty the buffers, and then use osync_inode_buffers to wait for + * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer + * as you dirty the buffers, and then use osync_inode_buffers to wait for * completion. Any other dirty buffers which are not yet queued for * write will not be flushed to disk by the osync. 
*/ @@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev, struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) - ll_rw_block(REQ_OP_WRITE, 1, &bh); + write_dirty_buffer(bh, 0); put_bh(bh); } } @@ -1342,23 +1342,12 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); + bh_readahead(bh, REQ_RAHEAD); brelse(bh); } } EXPORT_SYMBOL(__breadahead); -void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size, - gfp_t gfp) -{ - struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); - if (likely(bh)) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); - brelse(bh); - } -} -EXPORT_SYMBOL(__breadahead_gfp); - /** * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from @@ -1464,19 +1453,15 @@ EXPORT_SYMBOL(set_bh_page); static void discard_buffer(struct buffer_head * bh) { - unsigned long b_state, b_state_old; + unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - b_state = bh->b_state; - for (;;) { - b_state_old = cmpxchg(&bh->b_state, b_state, - (b_state & ~BUFFER_FLAGS_DISCARD)); - if (b_state_old == b_state) - break; - b_state = b_state_old; - } + b_state = READ_ONCE(bh->b_state); + do { + } while (!try_cmpxchg(&bh->b_state, &b_state, + b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } @@ -1817,7 +1802,7 @@ done: /* * The page was marked dirty, but the buffers were * clean. Someone wrote them back by hand with - * ll_rw_block/submit_bh. A rare case. + * write_dirty_buffer/submit_bh. A rare case. 
*/ end_page_writeback(page); @@ -2033,7 +2018,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); *wait_bh++=bh; } } @@ -2352,7 +2337,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct page *page; - void *fsdata; + void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); @@ -2378,7 +2363,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct page *page; - void *fsdata; + void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; @@ -2593,11 +2578,9 @@ int block_truncate_page(struct address_space *mapping, set_buffer_uptodate(bh); if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { - err = -EIO; - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); + err = bh_read(bh, 0); /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + if (err < 0) goto unlock; } @@ -2725,61 +2708,6 @@ void submit_bh(blk_opf_t opf, struct buffer_head *bh) } EXPORT_SYMBOL(submit_bh); -/** - * ll_rw_block: low-level access to block devices (DEPRECATED) - * @opf: block layer request operation and flags. - * @nr: number of &struct buffer_heads in the array - * @bhs: array of pointers to &struct buffer_head - * - * ll_rw_block() takes an array of pointers to &struct buffer_heads, and - * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. - * @opf contains flags modifying the detailed I/O behavior, most notably - * %REQ_RAHEAD. 
- * - * This function drops any buffer that it cannot get a lock on (with the - * BH_Lock state bit), any buffer that appears to be clean when doing a write - * request, and any buffer that appears to be up-to-date when doing read - * request. Further it marks as clean buffers that are processed for - * writing (the buffer cache won't assume that they are actually clean - * until the buffer gets unlocked). - * - * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if appropriate), unlocks the buffer and wakes - * any waiters. - * - * All of the buffers must be for the same device, and must also be a - * multiple of the current approved size for the device. - */ -void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[]) -{ - const enum req_op op = opf & REQ_OP_MASK; - int i; - - for (i = 0; i < nr; i++) { - struct buffer_head *bh = bhs[i]; - - if (!trylock_buffer(bh)) - continue; - if (op == REQ_OP_WRITE) { - if (test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - submit_bh(opf, bh); - continue; - } - } else { - if (!buffer_uptodate(bh)) { - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(opf, bh); - continue; - } - } - unlock_buffer(bh); - } -} -EXPORT_SYMBOL(ll_rw_block); - void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); @@ -3026,29 +2954,69 @@ int bh_uptodate_or_lock(struct buffer_head *bh) EXPORT_SYMBOL(bh_uptodate_or_lock); /** - * bh_submit_read - Submit a locked buffer for reading + * __bh_read - Submit read for a locked buffer * @bh: struct buffer_head + * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ + * @wait: wait until reading finish * - * Returns zero on success and -EIO on error. + * Returns zero on success or don't wait, and -EIO on error. 
*/ -int bh_submit_read(struct buffer_head *bh) +int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { - BUG_ON(!buffer_locked(bh)); + int ret = 0; - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - return 0; - } + BUG_ON(!buffer_locked(bh)); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return 0; - return -EIO; + submit_bh(REQ_OP_READ | op_flags, bh); + if (wait) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } + return ret; +} +EXPORT_SYMBOL(__bh_read); + +/** + * __bh_read_batch - Submit read for a batch of unlocked buffers + * @nr: entry number of the buffer batch + * @bhs: a batch of struct buffer_head + * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ + * @force_lock: force to get a lock on the buffer if set, otherwise drops any + * buffer that cannot lock. + * + * Returns zero on success or don't wait, and -EIO on error. + */ +void __bh_read_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags, bool force_lock) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (buffer_uptodate(bh)) + continue; + + if (force_lock) + lock_buffer(bh); + else + if (!trylock_buffer(bh)) + continue; + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ | op_flags, bh); + } } -EXPORT_SYMBOL(bh_submit_read); +EXPORT_SYMBOL(__bh_read_batch); void __init buffer_init(void) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index facf2ebe464b..03ca8f2f657a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -15,9 +15,8 @@ * file or directory. The caller must hold the inode lock. 
*/ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); bool can_use = false; if (!(inode->i_flags & S_KERNEL_FILE)) { @@ -26,21 +25,18 @@ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, can_use = true; } else { trace_cachefiles_mark_failed(object, inode); - pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", - dentry, inode->i_ino); } return can_use; } static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); bool can_use; inode_lock(inode); - can_use = __cachefiles_mark_inode_in_use(object, dentry); + can_use = __cachefiles_mark_inode_in_use(object, inode); inode_unlock(inode); return can_use; } @@ -49,21 +45,17 @@ static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, * Unmark a backing inode. The caller must hold the inode lock. 
*/ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); - inode->i_flags &= ~S_KERNEL_FILE; trace_cachefiles_mark_inactive(object, inode); } static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); - inode_lock(inode); - __cachefiles_unmark_inode_in_use(object, dentry); + __cachefiles_unmark_inode_in_use(object, inode); inode_unlock(inode); } @@ -77,14 +69,12 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct cachefiles_cache *cache = object->volume->cache; struct inode *inode = file_inode(file); - if (inode) { - cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry); + cachefiles_do_unmark_inode_in_use(object, inode); - if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { - atomic_long_add(inode->i_blocks, &cache->b_released); - if (atomic_inc_return(&cache->f_released)) - cachefiles_state_changed(cache); - } + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + atomic_long_add(inode->i_blocks, &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); } } @@ -164,8 +154,11 @@ retry: inode_lock(d_inode(subdir)); inode_unlock(d_inode(dir)); - if (!__cachefiles_mark_inode_in_use(NULL, subdir)) + if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + subdir, d_inode(subdir)->i_ino); goto mark_error; + } inode_unlock(d_inode(subdir)); @@ -224,9 +217,7 @@ nomem_d_alloc: void cachefiles_put_directory(struct dentry *dir) { if (dir) { - inode_lock(dir->d_inode); - __cachefiles_unmark_inode_in_use(NULL, dir); - inode_unlock(dir->d_inode); + cachefiles_do_unmark_inode_in_use(NULL, d_inode(dir)); dput(dir); } } @@ -410,7 +401,7 @@ try_again: "Rename failed 
with error %d", ret); } - __cachefiles_unmark_inode_in_use(object, rep); + __cachefiles_unmark_inode_in_use(object, d_inode(rep)); unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -451,84 +442,72 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) const struct cred *saved_cred; struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; struct file *file; - struct path path; + const struct path parentpath = { .mnt = cache->mnt, .dentry = fan }; uint64_t ni_size; long ret; cachefiles_begin_secure(cache, &saved_cred); - path.mnt = cache->mnt; ret = cachefiles_inject_write_error(); - if (ret == 0) - path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR); - else - path.dentry = ERR_PTR(ret); - if (IS_ERR(path.dentry)) { - trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry), + if (ret == 0) { + file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG, + O_RDWR | O_LARGEFILE | O_DIRECT, + cache->cache_cred); + ret = PTR_ERR_OR_ZERO(file); + } + if (ret) { + trace_cachefiles_vfs_error(object, d_inode(fan), ret, cachefiles_trace_tmpfile_error); - if (PTR_ERR(path.dentry) == -EIO) + if (ret == -EIO) cachefiles_io_error_obj(object, "Failed to create tmpfile"); - file = ERR_CAST(path.dentry); - goto out; + goto err; } - trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry)); + trace_cachefiles_tmpfile(object, file_inode(file)); - if (!cachefiles_mark_inode_in_use(object, path.dentry)) { - file = ERR_PTR(-EBUSY); - goto out_dput; - } + /* This is a newly created file with no other possible user */ + if (!cachefiles_mark_inode_in_use(object, file_inode(file))) + WARN_ON(1); ret = cachefiles_ondemand_init_object(object); - if (ret < 0) { - file = ERR_PTR(ret); - goto out_unuse; - } + if (ret < 0) + goto err_unuse; ni_size = object->cookie->object_size; ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); if (ni_size > 0) { - trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, 
ni_size, + trace_cachefiles_trunc(object, file_inode(file), 0, ni_size, cachefiles_trunc_expand_tmpfile); ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_truncate(&path, ni_size); + ret = vfs_truncate(&file->f_path, ni_size); if (ret < 0) { trace_cachefiles_vfs_error( - object, d_backing_inode(path.dentry), ret, + object, file_inode(file), ret, cachefiles_trace_trunc_error); - file = ERR_PTR(ret); - goto out_unuse; + goto err_unuse; } } - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(path.dentry), cache->cache_cred); - if (IS_ERR(file)) { - trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), - PTR_ERR(file), - cachefiles_trace_open_error); - goto out_unuse; - } + ret = -EINVAL; if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); - file = ERR_PTR(-EINVAL); - goto out_unuse; + goto err_unuse; } - - goto out_dput; - -out_unuse: - cachefiles_do_unmark_inode_in_use(object, path.dentry); -out_dput: - dput(path.dentry); out: cachefiles_end_secure(cache, saved_cred); return file; + +err_unuse: + cachefiles_do_unmark_inode_in_use(object, file_inode(file)); + fput(file); +err: + file = ERR_PTR(ret); + goto out; } /* @@ -569,8 +548,11 @@ static bool cachefiles_open_file(struct cachefiles_object *object, _enter("%pd", dentry); - if (!cachefiles_mark_inode_in_use(object, dentry)) + if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + dentry, d_inode(dentry)->i_ino); return false; + } /* We need to open a file interface onto a data file now as we can't do * it on demand because writeback called from do_exit() sees @@ -624,7 +606,7 @@ check_failed: error_fput: fput(file); error: - cachefiles_do_unmark_inode_in_use(object, dentry); + cachefiles_do_unmark_inode_in_use(object, d_inode(dentry)); dput(dentry); return false; } diff --git 
a/fs/ceph/caps.c b/fs/ceph/caps.c index 53cfe026b3ea..fb023f9fafcb 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode, cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; + wake_up_all(&ci->i_cap_wq); } /* @@ -2285,7 +2286,7 @@ retry: struct ceph_mds_request *req; int i; - sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL); + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { err = -ENOMEM; goto out; @@ -2759,13 +2760,17 @@ again: * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. + * + * For RDCACHE|RD -> RD, there is not need to wait and we can + * just exclude the revoking caps and force to sync read. */ int not = want & ~(have & need); int revoking = implemented & ~have; + int exclude = revoking & not; dout("get_cap_refs %p have %s but not %s (revoking %s)\n", inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); - if ((revoking & not) == 0) { + if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { @@ -2787,7 +2792,7 @@ again: snap_rwsem_locked = true; } if ((have & want) == want) - *got = need | want; + *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); @@ -3550,6 +3555,9 @@ static void handle_cap_grant(struct inode *inode, check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ + /* If there is new caps, try to wake up the waiters */ + if (~cap->issued & newcaps) + wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e0fa66ac8b9f..f780e4e0d062 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) static 
struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct inode *inode = __lookup_inode(sb, ino); + struct ceph_inode_info *ci = ceph_inode(inode); int err; if (IS_ERR(inode)) @@ -192,7 +193,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_PTR(err); } /* -ESTALE if inode as been unlinked and no file is open */ - if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { + if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) { iput(inode); return ERR_PTR(-ESTALE); } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 42351d7a0dd6..4af5e55abc15 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32() % nsplits; + i = prandom_u32_max(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; @@ -2192,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); inode->i_ctime = attr->ia_ctime; + inode_inc_iversion_raw(inode); } release &= issued; @@ -2356,6 +2357,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, goto out; } + req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { err = -ENOMEM; @@ -2447,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct super_block *sb = inode->i_sb; struct ceph_inode_info *ci = ceph_inode(inode); u32 valid_mask = STATX_BASIC_STATS; int err = 0; @@ -2476,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, } if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; + stat->dev = sb->s_dev; else 
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) { stat->size = ci->i_rbytes; - else + } else if (ceph_snap(inode) == CEPH_SNAPDIR) { + struct ceph_inode_info *pci; + struct ceph_snap_realm *realm; + struct inode *parent; + + parent = ceph_lookup_inode(sb, ceph_ino(inode)); + if (!parent) + return PTR_ERR(parent); + + pci = ceph_inode(parent); + spin_lock(&pci->i_ceph_lock); + realm = pci->i_snap_realm; + if (realm) + stat->size = realm->num_snaps; + else + stat->size = 0; + spin_unlock(&pci->i_ceph_lock); + iput(parent); + } else { stat->size = ci->i_files + ci->i_subdirs; + } stat->blocks = 0; stat->blksize = 65536; /* diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 80f8b9ec1a31..26a0a8b9975e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2318,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; + req->r_feature_needed = -1; kref_init(&req->r_kref); RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); @@ -2916,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc, dout("do_request mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); + + /* + * The old ceph will crash the MDSs when see unknown OPs + */ + if (req->r_feature_needed > 0 && + !test_bit(req->r_feature_needed, &session->s_features)) { + err = -EOPNOTSUPP; + goto out_session; + } + if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 256e3eada6c1..0598faa50e2e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -31,8 +31,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_METRIC_COLLECT, 
CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_OP_GETVXATTR, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -44,6 +45,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ + CEPHFS_FEATURE_OP_GETVXATTR, \ } /* @@ -336,6 +338,8 @@ struct ceph_mds_request { long long r_dir_ordered_cnt; int r_readdir_cache_idx; + int r_feature_needed; + struct ceph_cap_reservation r_caps_reservation; }; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 8d0a6d2c2da4..3fbabc98e1f7 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) return -1; /* pick */ - n = prandom_u32() % n; + n = prandom_u32_max(n); for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index b401339f6e73..60399081046a 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -5,12 +5,99 @@ * Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com> */ +#include <linux/namei.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" #include "smb2proto.h" #include "cached_dir.h" +static struct cached_fid *init_cached_dir(const char *path); +static void free_cached_dir(struct cached_fid *cfid); + +static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, + const char *path, + bool lookup_only) +{ + struct cached_fid *cfid; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (!strcmp(cfid->path, path)) { + /* + * If it doesn't have a lease it is either not yet + * fully cached or it may be in the process of + * being deleted due to a lease break. 
+ */ + if (!cfid->has_lease) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + kref_get(&cfid->refcount); + spin_unlock(&cfids->cfid_list_lock); + return cfid; + } + } + if (lookup_only) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + if (cfids->num_entries >= MAX_CACHED_FIDS) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + cfid = init_cached_dir(path); + if (cfid == NULL) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + cfid->cfids = cfids; + cfids->num_entries++; + list_add(&cfid->entry, &cfids->entries); + cfid->on_list = true; + kref_get(&cfid->refcount); + spin_unlock(&cfids->cfid_list_lock); + return cfid; +} + +static struct dentry * +path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path) +{ + struct dentry *dentry; + const char *s, *p; + char sep; + + sep = CIFS_DIR_SEP(cifs_sb); + dentry = dget(cifs_sb->root); + s = path; + + do { + struct inode *dir = d_inode(dentry); + struct dentry *child; + + if (!S_ISDIR(dir->i_mode)) { + dput(dentry); + dentry = ERR_PTR(-ENOTDIR); + break; + } + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + child = lookup_positive_unlocked(p, dentry, s - p); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); + return dentry; +} + /* * Open the and cache a directory handle. * If error then *cfid is not initialized. 
@@ -31,54 +118,57 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct kvec qi_iov[1]; int rc, flags = 0; - __le16 utf16_path = 0; /* Null - since an open of top of share */ + __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_II; struct cifs_fid *pfid; - struct dentry *dentry; + struct dentry *dentry = NULL; struct cached_fid *cfid; + struct cached_fids *cfids; - if (tcon == NULL || tcon->nohandlecache || + if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || is_smb1_server(tcon->ses->server)) return -EOPNOTSUPP; ses = tcon->ses; server = ses->server; + cfids = tcon->cfids; - if (cifs_sb->root == NULL) - return -ENOENT; + if (!server->ops->new_lease_key) + return -EIO; - if (strlen(path)) + if (cifs_sb->root == NULL) return -ENOENT; - dentry = cifs_sb->root; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; - cfid = tcon->cfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->is_valid) { - cifs_dbg(FYI, "found a cached root file handle\n"); + cfid = find_or_create_cached_dir(cfids, path, lookup_only); + if (cfid == NULL) { + kfree(utf16_path); + return -ENOENT; + } + /* + * At this point we either have a lease already and we can just + * return it. If not we are guaranteed to be the only thread accessing + * this cfid. + */ + if (cfid->has_lease) { *ret_cfid = cfid; - kref_get(&cfid->refcount); - mutex_unlock(&cfid->fid_mutex); + kfree(utf16_path); return 0; } /* * We do not hold the lock for the open because in case - * SMB2_open needs to reconnect, it will end up calling - * cifs_mark_open_files_invalid() which takes the lock again - * thus causing a deadlock + * SMB2_open needs to reconnect. + * This is safe because no other thread will be able to get a ref + * to the cfid until we have finished opening the file and (possibly) + * acquired a lease. 
*/ - mutex_unlock(&cfid->fid_mutex); - - if (lookup_only) - return -ENOENT; - if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - if (!server->ops->new_lease_key) - return -EIO; - pfid = &cfid->fid; server->ops->new_lease_key(pfid); @@ -99,7 +189,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.reconnect = false; rc = SMB2_open_init(tcon, server, - &rqst[0], &oplock, &oparms, &utf16_path); + &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto oshr_free; smb2_set_next_command(tcon, &rqst[0]); @@ -122,47 +212,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); - mutex_lock(&cfid->fid_mutex); - - /* - * Now we need to check again as the cached root might have - * been successfully re-opened from a concurrent process - */ - - if (cfid->is_valid) { - /* work was already done */ - - /* stash fids for close() later */ - struct cifs_fid fid = { - .persistent_fid = pfid->persistent_fid, - .volatile_fid = pfid->volatile_fid, - }; - - /* - * caller expects this func to set the fid in cfid to valid - * cached root, so increment the refcount. 
- */ - kref_get(&cfid->refcount); - - mutex_unlock(&cfid->fid_mutex); - - if (rc == 0) { - /* close extra handle outside of crit sec */ - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); - } - rc = 0; - goto oshr_free; - } - - /* Cached root is still invalid, continue normaly */ - if (rc) { if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); } - goto oshr_exit; + goto oshr_free; } atomic_inc(&tcon->num_remote_opens); @@ -174,30 +230,18 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ - cfid->tcon = tcon; - cfid->is_valid = true; - cfid->dentry = dentry; - dget(dentry); - kref_init(&cfid->refcount); + if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) + goto oshr_free; - /* BB TBD check to see if oplock level check can be removed below */ - if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { - /* - * See commit 2f94a3125b87. 
Increment the refcount when we - * get a lease for root, release it if lease break occurs - */ - kref_get(&cfid->refcount); - cfid->has_lease = true; - smb2_parse_contexts(server, o_rsp, - &oparms.fid->epoch, - oparms.fid->lease_key, &oplock, - NULL, NULL); - } else - goto oshr_exit; + + smb2_parse_contexts(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key, &oplock, + NULL, NULL); qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) - goto oshr_exit; + goto oshr_free; if (!smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), sizeof(struct smb2_file_all_info), @@ -205,15 +249,42 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, (char *)&cfid->file_all_info)) cfid->file_all_info_is_valid = true; + if (!path[0]) + dentry = dget(cifs_sb->root); + else { + dentry = path_to_dentry(cifs_sb, path); + if (IS_ERR(dentry)) { + rc = -ENOENT; + goto oshr_free; + } + } + cfid->dentry = dentry; + cfid->tcon = tcon; cfid->time = jiffies; + cfid->is_open = true; + cfid->has_lease = true; -oshr_exit: - mutex_unlock(&cfid->fid_mutex); oshr_free: + kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + spin_lock(&cfids->cfid_list_lock); + if (!cfid->has_lease) { + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + } + rc = -ENOENT; + } + spin_unlock(&cfids->cfid_list_lock); + if (rc) { + free_cached_dir(cfid); + cfid = NULL; + } + if (rc == 0) *ret_cfid = cfid; @@ -225,18 +296,22 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct cached_fid **ret_cfid) { struct cached_fid *cfid; + struct cached_fids *cfids = tcon->cfids; - cfid = tcon->cfid; + if (cfids == NULL) + return -ENOENT; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry == dentry) { - cifs_dbg(FYI, "found a 
cached root file handle by dentry\n"); - *ret_cfid = cfid; - kref_get(&cfid->refcount); - mutex_unlock(&cfid->fid_mutex); - return 0; + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (dentry && cfid->dentry == dentry) { + cifs_dbg(FYI, "found a cached root file handle by dentry\n"); + kref_get(&cfid->refcount); + *ret_cfid = cfid; + spin_unlock(&cfids->cfid_list_lock); + return 0; + } } - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfids->cfid_list_lock); return -ENOENT; } @@ -245,63 +320,50 @@ smb2_close_cached_fid(struct kref *ref) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); - struct cached_dirent *dirent, *q; - if (cfid->is_valid) { - cifs_dbg(FYI, "clear cached root file handle\n"); - SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid); + spin_lock(&cfid->cfids->cfid_list_lock); + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfid->cfids->num_entries--; } + spin_unlock(&cfid->cfids->cfid_list_lock); - /* - * We only check validity above to send SMB2_close, - * but we still need to invalidate these entries - * when this function is called - */ - cfid->is_valid = false; - cfid->file_all_info_is_valid = false; - cfid->has_lease = false; - if (cfid->dentry) { - dput(cfid->dentry); - cfid->dentry = NULL; - } - /* - * Delete all cached dirent names - */ - mutex_lock(&cfid->dirents.de_mutex); - list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { - list_del(&dirent->entry); - kfree(dirent->name); - kfree(dirent); + dput(cfid->dentry); + cfid->dentry = NULL; + + if (cfid->is_open) { + SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid); } - cfid->dirents.is_valid = 0; - cfid->dirents.is_failed = 0; - cfid->dirents.ctx = NULL; - cfid->dirents.pos = 0; - mutex_unlock(&cfid->dirents.de_mutex); + free_cached_dir(cfid); } -void close_cached_dir(struct cached_fid *cfid) +void 
drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb) { - mutex_lock(&cfid->fid_mutex); - kref_put(&cfid->refcount, smb2_close_cached_fid); - mutex_unlock(&cfid->fid_mutex); -} + struct cached_fid *cfid = NULL; + int rc; -void close_cached_dir_lease_locked(struct cached_fid *cfid) -{ + rc = open_cached_dir(xid, tcon, name, cifs_sb, true, &cfid); + if (rc) { + cifs_dbg(FYI, "no cached dir found for rmdir(%s)\n", name); + return; + } + spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->has_lease) { cfid->has_lease = false; kref_put(&cfid->refcount, smb2_close_cached_fid); } + spin_unlock(&cfid->cfids->cfid_list_lock); + close_cached_dir(cfid); } -void close_cached_dir_lease(struct cached_fid *cfid) + +void close_cached_dir(struct cached_fid *cfid) { - mutex_lock(&cfid->fid_mutex); - close_cached_dir_lease_locked(cfid); - mutex_unlock(&cfid->fid_mutex); + kref_put(&cfid->refcount, smb2_close_cached_fid); } /* @@ -314,34 +376,60 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) struct cached_fid *cfid; struct cifs_tcon *tcon; struct tcon_link *tlink; + struct cached_fids *cfids; for (node = rb_first(root); node; node = rb_next(node)) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); tcon = tlink_tcon(tlink); if (IS_ERR(tcon)) continue; - cfid = tcon->cfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry) { + cfids = tcon->cfids; + if (cfids == NULL) + continue; + list_for_each_entry(cfid, &cfids->entries, entry) { dput(cfid->dentry); cfid->dentry = NULL; } - mutex_unlock(&cfid->fid_mutex); } } /* - * Invalidate and close all cached dirs when a TCON has been reset + * Invalidate all cached dirs when a TCON has been reset * due to a session loss. 
*/ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) { - mutex_lock(&tcon->cfid->fid_mutex); - tcon->cfid->is_valid = false; - /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_cached_dir_lease_locked(tcon->cfid); - memset(&tcon->cfid->fid, 0, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->cfid->fid_mutex); + struct cached_fids *cfids = tcon->cfids; + struct cached_fid *cfid, *q; + LIST_HEAD(entry); + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + list_move(&cfid->entry, &entry); + cfids->num_entries--; + cfid->is_open = false; + cfid->on_list = false; + /* To prevent race with smb2_cached_lease_break() */ + kref_get(&cfid->refcount); + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + cancel_work_sync(&cfid->lease_break); + if (cfid->has_lease) { + /* + * We lease was never cancelled from the server so we + * need to drop the reference. 
+ */ + spin_lock(&cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); + } + /* Drop the extra reference opened above*/ + kref_put(&cfid->refcount, smb2_close_cached_fid); + } } static void @@ -350,39 +438,121 @@ smb2_cached_lease_break(struct work_struct *work) struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_cached_dir_lease(cfid); + spin_lock(&cfid->cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfid->cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); } int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) { - if (tcon->cfid->is_valid && - !memcmp(lease_key, - tcon->cfid->fid.lease_key, - SMB2_LEASE_KEY_SIZE)) { - tcon->cfid->time = 0; - INIT_WORK(&tcon->cfid->lease_break, - smb2_cached_lease_break); - queue_work(cifsiod_wq, - &tcon->cfid->lease_break); - return true; + struct cached_fids *cfids = tcon->cfids; + struct cached_fid *cfid; + + if (cfids == NULL) + return false; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (cfid->has_lease && + !memcmp(lease_key, + cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE)) { + cfid->time = 0; + /* + * We found a lease remove it from the list + * so no threads can access it. 
+ */ + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + + queue_work(cifsiod_wq, + &cfid->lease_break); + spin_unlock(&cfids->cfid_list_lock); + return true; + } } + spin_unlock(&cfids->cfid_list_lock); return false; } -struct cached_fid *init_cached_dir(void) +static struct cached_fid *init_cached_dir(const char *path) { struct cached_fid *cfid; - cfid = kzalloc(sizeof(*cfid), GFP_KERNEL); + cfid = kzalloc(sizeof(*cfid), GFP_ATOMIC); if (!cfid) return NULL; + cfid->path = kstrdup(path, GFP_ATOMIC); + if (!cfid->path) { + kfree(cfid); + return NULL; + } + + INIT_WORK(&cfid->lease_break, smb2_cached_lease_break); + INIT_LIST_HEAD(&cfid->entry); INIT_LIST_HEAD(&cfid->dirents.entries); mutex_init(&cfid->dirents.de_mutex); - mutex_init(&cfid->fid_mutex); + spin_lock_init(&cfid->fid_lock); + kref_init(&cfid->refcount); return cfid; } -void free_cached_dir(struct cifs_tcon *tcon) +static void free_cached_dir(struct cached_fid *cfid) +{ + struct cached_dirent *dirent, *q; + + dput(cfid->dentry); + cfid->dentry = NULL; + + /* + * Delete all cached dirent names + */ + list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { + list_del(&dirent->entry); + kfree(dirent->name); + kfree(dirent); + } + + kfree(cfid->path); + cfid->path = NULL; + kfree(cfid); +} + +struct cached_fids *init_cached_dirs(void) +{ + struct cached_fids *cfids; + + cfids = kzalloc(sizeof(*cfids), GFP_KERNEL); + if (!cfids) + return NULL; + spin_lock_init(&cfids->cfid_list_lock); + INIT_LIST_HEAD(&cfids->entries); + return cfids; +} + +/* + * Called from tconInfoFree when we are tearing down the tcon. + * There are no active users or open files/directories at this point. 
+ */ +void free_cached_dirs(struct cached_fids *cfids) { - kfree(tcon->cfid); + struct cached_fid *cfid, *q; + LIST_HEAD(entry); + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + cfid->on_list = false; + cfid->is_open = false; + list_move(&cfid->entry, &entry); + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + free_cached_dir(cfid); + } + + kfree(cfids); } diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h index bd262dc8b179..2f4e764c9ca9 100644 --- a/fs/cifs/cached_dir.h +++ b/fs/cifs/cached_dir.h @@ -31,13 +31,17 @@ struct cached_dirents { }; struct cached_fid { - bool is_valid:1; /* Do we have a useable root fid */ - bool file_all_info_is_valid:1; + struct list_head entry; + struct cached_fids *cfids; + const char *path; bool has_lease:1; + bool is_open:1; + bool on_list:1; + bool file_all_info_is_valid:1; unsigned long time; /* jiffies of when lease was taken */ struct kref refcount; struct cifs_fid fid; - struct mutex fid_mutex; + spinlock_t fid_lock; struct cifs_tcon *tcon; struct dentry *dentry; struct work_struct lease_break; @@ -45,8 +49,18 @@ struct cached_fid { struct cached_dirents dirents; }; -extern struct cached_fid *init_cached_dir(void); -extern void free_cached_dir(struct cifs_tcon *tcon); +#define MAX_CACHED_FIDS 16 +struct cached_fids { + /* Must be held when: + * - accessing the cfids->entries list + */ + spinlock_t cfid_list_lock; + int num_entries; + struct list_head entries; +}; + +extern struct cached_fids *init_cached_dirs(void); +extern void free_cached_dirs(struct cached_fids *cfids); extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, @@ -55,8 +69,10 @@ extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct dentry *dentry, struct cached_fid **cfid); extern void close_cached_dir(struct cached_fid *cfid); -extern void 
close_cached_dir_lease(struct cached_fid *cfid); -extern void close_cached_dir_lease_locked(struct cached_fid *cfid); +extern void drop_cached_dir_by_name(const unsigned int xid, + struct cifs_tcon *tcon, + const char *name, + struct cifs_sb_info *cifs_sb); extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb); extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon); extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index c05477e28cff..90850da390ae 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -87,7 +87,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) { __u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); - seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count); + seq_printf(m, "%s Mounts: %d ", tcon->tree_name, tcon->tc_count); if (tcon->nativeFileSystem) seq_printf(m, "Type: %s ", tcon->nativeFileSystem); seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d", @@ -601,7 +601,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { i++; - seq_printf(m, "\n%d) %s", i, tcon->treeName); + seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); seq_printf(m, "\nSMBs: %d", diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index ee4ea2b60c0f..d44808263cfb 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -108,8 +108,8 @@ do { \ #define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) 
\ do { \ const char *tn = ""; \ - if (tcon && tcon->treeName) \ - tn = tcon->treeName; \ + if (tcon && tcon->tree_name) \ + tn = tcon->tree_name; \ if ((type) & FYI && cifsFYI & CIFS_INFO) { \ pr_debug_ ## ratefunc("%s: %s " fmt, \ __FILE__, tn, ##__VA_ARGS__); \ @@ -150,7 +150,7 @@ do { \ #define cifs_tcon_dbg(type, fmt, ...) \ do { \ if (0) \ - pr_debug("%s " fmt, tcon->treeName, ##__VA_ARGS__); \ + pr_debug("%s " fmt, tcon->tree_name, ##__VA_ARGS__); \ } while (0) #define cifs_info(fmt, ...) \ diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index b87cbbe6d2d4..d86d78d5bfdc 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -91,6 +91,13 @@ struct smb3_notify { bool watch_tree; } __packed; +struct smb3_notify_info { + __u32 completion_filter; + bool watch_tree; + __u32 data_len; /* size of notify data below */ + __u8 notify_data[]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) @@ -100,6 +107,7 @@ struct smb3_notify { #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) #define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) +#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) #define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) /* diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 1e4c7cc5287f..7233c6a7e6d7 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -256,23 +256,23 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) const char *share_name; const char *net_name; - net_name = extract_hostname(tcon->treeName); + net_name = extract_hostname(tcon->tree_name); if (IS_ERR(net_name)) { int ret; ret = PTR_ERR(net_name); cifs_dbg(VFS, "%s: failed to extract host name from target '%s': %d\n", - __func__, tcon->treeName, ret); + __func__, 
tcon->tree_name, ret); return ERR_PTR(-EINVAL); } - share_name = extract_sharename(tcon->treeName); + share_name = extract_sharename(tcon->tree_name); if (IS_ERR(share_name)) { int ret; ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", - __func__, tcon->treeName, ret); + __func__, tcon->tree_name, ret); kfree(net_name); return ERR_PTR(-EINVAL); } @@ -335,14 +335,14 @@ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon) goto fail; } - reg->net_name = extract_hostname(tcon->treeName); + reg->net_name = extract_hostname(tcon->tree_name); if (IS_ERR(reg->net_name)) { ret = PTR_ERR(reg->net_name); cifs_dbg(VFS, "%s: failed to extract host name from target: %d\n", __func__, ret); goto fail_idr; } - reg->share_name = extract_sharename(tcon->treeName); + reg->share_name = extract_sharename(tcon->tree_name); if (IS_ERR(reg->share_name)) { ret = PTR_ERR(reg->share_name); cifs_dbg(VFS, "%s: failed to extract share name from target: %d\n", __func__, ret); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 46f5718754f9..5db73c0f792a 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -103,26 +103,24 @@ static int cifs_calc_signature(struct smb_rqst *rqst, if (!rqst->rq_iov || !signature || !server) return -EINVAL; - rc = cifs_alloc_hash("md5", &server->secmech.md5, - &server->secmech.sdescmd5); + rc = cifs_alloc_hash("md5", &server->secmech.md5); if (rc) return -1; - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + rc = crypto_shash_init(server->secmech.md5); if (rc) { cifs_dbg(VFS, "%s: Could not init md5\n", __func__); return rc; } - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(server->secmech.md5, server->session_key.response, server->session_key.len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } - return __cifs_calc_signature(rqst, server, signature, - &server->secmech.sdescmd5->shash); 
+ return __cifs_calc_signature(rqst, server, signature, server->secmech.md5); } /* must be called with server->srv_mutex held */ @@ -412,7 +410,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, wchar_t *domain; wchar_t *server; - if (!ses->server->secmech.sdeschmacmd5) { + if (!ses->server->secmech.hmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } @@ -420,14 +418,14 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash, nls_cp); - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__); return rc; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; @@ -448,7 +446,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, memset(user, '\0', 2); } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, (char *)user, 2 * len); kfree(user); if (rc) { @@ -468,7 +466,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, nls_cp); rc = - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + crypto_shash_update(ses->server->secmech.hmacmd5, (char *)domain, 2 * len); kfree(domain); if (rc) { @@ -488,7 +486,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); rc = - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + crypto_shash_update(ses->server->secmech.hmacmd5, (char *)server, 2 * len); kfree(server); if (rc) { @@ -498,7 +496,7 @@ 
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -518,12 +516,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + offsetof(struct ntlmv2_resp, challenge.key[0])); - if (!ses->server->secmech.sdeschmacmd5) { + if (!ses->server->secmech.hmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", @@ -531,7 +529,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return rc; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; @@ -543,7 +541,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) else memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, ntlmv2->challenge.key, hash_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); @@ -551,7 +549,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) } /* Note that the MD5 digest over writes anon.challenge_key.key */ - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ntlmv2->ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -627,9 +625,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, 
const struct nls_table *nls_cp) cifs_server_lock(ses->server); - rc = cifs_alloc_hash("hmac(md5)", - &ses->server->secmech.hmacmd5, - &ses->server->secmech.sdeschmacmd5); + rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5); if (rc) { goto unlock; } @@ -649,7 +645,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } /* now calculate the session key for NTLMv2 */ - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", @@ -657,13 +653,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto unlock; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); goto unlock; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { @@ -671,7 +667,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto unlock; } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ses->auth_key.response); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -679,7 +675,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) unlock: cifs_server_unlock(ses->server); setup_ntlmv2_rsp_ret: - kfree(tiblob); + kfree_sensitive(tiblob); return rc; } @@ -718,49 +714,19 @@ calc_seckey(struct cifs_ses *ses) void cifs_crypto_secmech_release(struct TCP_Server_Info *server) { - if (server->secmech.cmacaes) { - crypto_free_shash(server->secmech.cmacaes); - server->secmech.cmacaes = NULL; - } - - if (server->secmech.hmacsha256) { - crypto_free_shash(server->secmech.hmacsha256); - 
server->secmech.hmacsha256 = NULL; - } - - if (server->secmech.md5) { - crypto_free_shash(server->secmech.md5); - server->secmech.md5 = NULL; - } + cifs_free_hash(&server->secmech.aes_cmac); + cifs_free_hash(&server->secmech.hmacsha256); + cifs_free_hash(&server->secmech.md5); + cifs_free_hash(&server->secmech.sha512); + cifs_free_hash(&server->secmech.hmacmd5); - if (server->secmech.sha512) { - crypto_free_shash(server->secmech.sha512); - server->secmech.sha512 = NULL; + if (server->secmech.enc) { + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; } - if (server->secmech.hmacmd5) { - crypto_free_shash(server->secmech.hmacmd5); - server->secmech.hmacmd5 = NULL; + if (server->secmech.dec) { + crypto_free_aead(server->secmech.dec); + server->secmech.dec = NULL; } - - if (server->secmech.ccmaesencrypt) { - crypto_free_aead(server->secmech.ccmaesencrypt); - server->secmech.ccmaesencrypt = NULL; - } - - if (server->secmech.ccmaesdecrypt) { - crypto_free_aead(server->secmech.ccmaesdecrypt); - server->secmech.ccmaesdecrypt = NULL; - } - - kfree(server->secmech.sdesccmacaes); - server->secmech.sdesccmacaes = NULL; - kfree(server->secmech.sdeschmacsha256); - server->secmech.sdeschmacsha256 = NULL; - kfree(server->secmech.sdeschmacmd5); - server->secmech.sdeschmacmd5 = NULL; - kfree(server->secmech.sdescmd5); - server->secmech.sdescmd5 = NULL; - kfree(server->secmech.sdescsha512); - server->secmech.sdescsha512 = NULL; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8042d7280dec..fe220686bba4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -396,6 +396,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->epoch = 0; spin_lock_init(&cifs_inode->open_file_lock); generate_random_uuid(cifs_inode->lease_key); + cifs_inode->symlink_target = NULL; /* * Can not set i_flags here - they get immediately overwritten to zero @@ -412,7 +413,11 @@ cifs_alloc_inode(struct super_block *sb) static void cifs_free_inode(struct inode *inode) { - 
kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); + struct cifsInodeInfo *cinode = CIFS_I(inode); + + if (S_ISLNK(inode->i_mode)) + kfree(cinode->symlink_target); + kmem_cache_free(cifs_inode_cachep, cinode); } static void @@ -1138,6 +1143,30 @@ const struct inode_operations cifs_file_inode_ops = { .fiemap = cifs_fiemap, }; +const char *cifs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + char *target_path; + + target_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!target_path) + return ERR_PTR(-ENOMEM); + + spin_lock(&inode->i_lock); + if (likely(CIFS_I(inode)->symlink_target)) { + strscpy(target_path, CIFS_I(inode)->symlink_target, PATH_MAX); + } else { + kfree(target_path); + target_path = ERR_PTR(-EOPNOTSUPP); + } + spin_unlock(&inode->i_lock); + + if (!IS_ERR(target_path)) + set_delayed_call(done, kfree_link, target_path); + + return target_path; +} + const struct inode_operations cifs_symlink_inode_ops = { .get_link = cifs_get_link, .permission = cifs_permission, @@ -1297,8 +1326,11 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, ssize_t rc; struct cifsFileInfo *cfile = dst_file->private_data; - if (cfile->swapfile) - return -EOPNOTSUPP; + if (cfile->swapfile) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff, len, flags); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 5b4a7a32bdc5..388b745a978e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 39 -#define CIFS_VERSION "2.39" +#define SMB3_PRODUCT_BUILD 40 +#define CIFS_VERSION "2.40" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ae7f571a7dba..1420acf987f0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h 
@@ -153,26 +153,16 @@ struct session_key { char *response; }; -/* crypto security descriptor definition */ -struct sdesc { - struct shash_desc shash; - char ctx[]; -}; - /* crypto hashing related structure/fields, not specific to a sec mech */ struct cifs_secmech { - struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ - struct crypto_shash *md5; /* md5 hash function */ - struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ - struct crypto_shash *cmacaes; /* block-cipher based MAC function */ - struct crypto_shash *sha512; /* sha512 hash function */ - struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ - struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ - struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ - struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ - struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */ - struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */ - struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */ + struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */ + struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */ + struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */ + struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */ + struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */ + + struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */ + struct crypto_aead *dec; /* smb3 decryption AEAD TFM (AES-CCM and AES-GCM) */ }; /* per smb session structure/fields */ @@ -195,6 +185,19 @@ struct cifs_cred { struct cifs_ace *aces; }; +struct cifs_open_info_data { + char *symlink_target; + union { + struct smb2_file_all_info fi; + struct smb311_posix_qinfo posix_fi; + }; +}; + +static inline void cifs_free_open_info(struct cifs_open_info_data *data) +{ + kfree(data->symlink_target); +} + /* 
***************************************************************** * Except the CIFS PDUs themselves all the @@ -317,20 +320,20 @@ struct smb_version_operations { int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); /* query path data from the server */ - int (*query_path_info)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - FILE_ALL_INFO *, bool *, bool *); + int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); /* query file data from the server */ - int (*query_file_info)(const unsigned int, struct cifs_tcon *, - struct cifs_fid *, FILE_ALL_INFO *); + int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data); /* query reparse tag from srv to determine which type of special file */ int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); /* get server index number */ - int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - u64 *uniqueid, FILE_ALL_INFO *); + int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid, + struct cifs_open_info_data *data); /* set size by path */ int (*set_path_size)(const unsigned int, struct cifs_tcon *, const char *, __u64, struct cifs_sb_info *, bool); @@ -379,8 +382,8 @@ struct smb_version_operations { struct cifs_sb_info *, const char *, char **, bool); /* open a file for non-posix mounts */ - int (*open)(const unsigned int, struct cifs_open_parms *, - __u32 *, FILE_ALL_INFO *); + int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf); /* set fid protocol-specific info */ void 
(*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); /* close a file */ @@ -451,7 +454,7 @@ struct smb_version_operations { int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *src_file, void __user *); int (*notify)(const unsigned int xid, struct file *pfile, - void __user *pbuf); + void __user *pbuf, bool return_changes); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -1133,6 +1136,7 @@ struct cifs_fattr { struct timespec64 cf_mtime; struct timespec64 cf_ctime; u32 cf_cifstag; + char *cf_symlink_target; }; /* @@ -1149,7 +1153,7 @@ struct cifs_tcon { struct list_head openFileList; spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ - char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ + char tree_name[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; char *password; /* for share-level security */ __u32 tid; /* The 4 byte tree id */ @@ -1228,7 +1232,7 @@ struct cifs_tcon { struct fscache_volume *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ - struct cached_fid *cfid; /* Cached root fid */ + struct cached_fids *cfids; /* BB add field for back pointer to sb struct(s)? */ #ifdef CONFIG_CIFS_DFS_UPCALL struct list_head ulist; /* cache update list */ @@ -1395,6 +1399,7 @@ struct cifsFileInfo { struct work_struct put; /* work for the final part of _put */ struct delayed_work deferred; bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ + char *symlink_target; }; struct cifs_io_parms { @@ -1553,6 +1558,7 @@ struct cifsInodeInfo { struct list_head deferred_closes; /* list of deferred closes */ spinlock_t deferred_lock; /* protection on deferred list */ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. 
*/ + char *symlink_target; }; static inline struct cifsInodeInfo * @@ -2121,4 +2127,14 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses) return sizeof(ses->workstation_name); } +static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src) +{ + memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src)); + dst->AccessFlags = src->AccessFlags; + dst->CurrentByteOffset = src->CurrentByteOffset; + dst->Mode = src->Mode; + dst->AlignmentRequirement = src->AlignmentRequirement; + dst->FileNameLength = src->FileNameLength; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index aeba371c4c70..d1abaeea974a 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -483,7 +483,7 @@ put_bcc(__u16 count, struct smb_hdr *hdr) typedef struct negotiate_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; - unsigned char DialectsArray[1]; + unsigned char DialectsArray[]; } __attribute__((packed)) NEGOTIATE_REQ; #define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */ @@ -508,13 +508,14 @@ typedef struct negotiate_rsp { __u8 EncryptionKeyLength; __u16 ByteCount; union { - unsigned char EncryptionKey[1]; /* cap extended security off */ + /* cap extended security off */ + DECLARE_FLEX_ARRAY(unsigned char, EncryptionKey); /* followed by Domain name - if extended security is off */ /* followed by 16 bytes of server GUID */ /* then security blob if cap_extended_security negotiated */ struct { unsigned char GUID[SMB1_CLIENT_GUID_SIZE]; - unsigned char SecurityBlob[1]; + unsigned char SecurityBlob[]; } __attribute__((packed)) extended_response; } __attribute__((packed)) u; } __attribute__((packed)) NEGOTIATE_RSP; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3bc94bcc7177..83e83d8beabb 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -182,10 +182,9 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile, extern int 
cifs_push_mandatory_locks(struct cifsFileInfo *cfile); extern void cifs_down_write(struct rw_semaphore *sem); -extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, - struct file *file, - struct tcon_link *tlink, - __u32 oplock); +struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock, + const char *symlink_target); extern int cifs_posix_open(const char *full_path, struct inode **inode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *oplock, __u16 *netfid, @@ -200,9 +199,9 @@ extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); extern struct inode *cifs_iget(struct super_block *sb, struct cifs_fattr *fattr); -extern int cifs_get_inode_info(struct inode **inode, const char *full_path, - FILE_ALL_INFO *data, struct super_block *sb, - int xid, const struct cifs_fid *fid); +int cifs_get_inode_info(struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct super_block *sb, int xid, + const struct cifs_fid *fid); extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, @@ -598,9 +597,8 @@ struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); -int cifs_alloc_hash(const char *name, struct crypto_shash **shash, - struct sdesc **sdesc); -void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); +int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); +void cifs_free_hash(struct shash_desc **sdesc); extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset); @@ -639,7 +637,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses, int cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info 
*server); int -SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon); +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7aa91e272027..1724066c1536 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -465,7 +465,7 @@ CIFSSMBNegotiate(const unsigned int xid, for (i = 0; i < CIFS_NUM_PROT; i++) { size_t len = strlen(protocols[i].name) + 1; - memcpy(pSMB->DialectsArray+count, protocols[i].name, len); + memcpy(&pSMB->DialectsArray[count], protocols[i].name, len); count += len; } inc_rfc1001_len(pSMB, count); @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, remap); } rename_info->target_name_len = cpu_to_le32(2 * len_of_str); - count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); + count = sizeof(struct set_file_rename) + (2 * len_of_str); byte_count += count; pSMB->DataCount = cpu_to_le16(count); pSMB->TotalDataCount = pSMB->DataCount; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7ae6f2c08153..1cc47dd3b4d6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -155,7 +155,7 @@ static void smb2_query_server_interfaces(struct work_struct *work) /* * query server network interfaces, in case they change */ - rc = SMB3_request_interfaces(0, tcon); + rc = SMB3_request_interfaces(0, tcon, false); if (rc) { cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); @@ -311,7 +311,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) } server->sequence_number = 0; server->session_estab = false; - kfree(server->session_key.response); + kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; server->lstrp = jiffies; @@ -1580,10 +1580,11 @@ cifs_put_tcp_session(struct TCP_Server_Info 
*server, int from_reconnect) cifs_crypto_secmech_release(server); - kfree(server->session_key.response); + kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; kfree(server->hostname); + server->hostname = NULL; task = xchg(&server->tsk, NULL); if (task) @@ -1940,7 +1941,8 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&ses->ses_lock); cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); - cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE"); + cifs_dbg(FYI, + "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE"); spin_lock(&cifs_tcp_ses_lock); if (--ses->ses_count > 0) { @@ -2293,7 +2295,7 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { if (tcon->status == TID_EXITING) return 0; - if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE)) + if (strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE)) return 0; if (tcon->seal != ctx->seal) return 0; @@ -2831,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet *ses_init_buf; + unsigned int req_noscope_len; struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), GFP_KERNEL); + if (ses_init_buf) { ses_init_buf->trailer.session_req.called_len = 32; @@ -2869,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) ses_init_buf->trailer.session_req.scope2 = 0; smb_buf = (struct smb_hdr *)ses_init_buf; - /* sizeof RFC1002_SESSION_REQUEST with no scope */ - smb_buf->smb_buf_length = cpu_to_be32(0x81000044); + /* sizeof RFC1002_SESSION_REQUEST with no scopes */ + req_noscope_len = sizeof(struct rfc1002_session_packet) - 2; + + /* == cpu_to_be32(0x81000044) */ + smb_buf->smb_buf_length = + cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len); rc = smb_send(server, smb_buf, 0x44); kfree(ses_init_buf); /* @@ -3921,12 
+3930,11 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) { - pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ - *bcc_ptr = 0; /* password is null byte */ - bcc_ptr++; /* skip password */ - /* already aligned so no need to do it below */ - } + + pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ + *bcc_ptr = 0; /* password is null byte */ + bcc_ptr++; /* skip password */ + /* already aligned so no need to do it below */ if (ses->server->sign) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; @@ -3989,7 +3997,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, } bcc_ptr += length + 1; bytes_left -= (length + 1); - strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name)); /* mostly informational -- no need to fail on error here */ kfree(tcon->nativeFileSystem); @@ -4134,7 +4142,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (ses->auth_key.response) { cifs_dbg(FYI, "Free previous auth_key.response = %p\n", ses->auth_key.response); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; ses->auth_key.len = 0; } @@ -4197,7 +4205,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->local_nls = cifs_sb->local_nls; ctx->linux_uid = fsuid; ctx->cred_uid = fsuid; - ctx->UNC = master_tcon->treeName; + ctx->UNC = master_tcon->tree_name; ctx->retry = master_tcon->retry; ctx->nocase = master_tcon->nocase; ctx->nohandlecache = master_tcon->nohandlecache; @@ -4663,7 +4671,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */ if (!server->current_fullpath || dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) { - rc 
= ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, cifs_sb->local_nls); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls); goto out; } @@ -4707,7 +4715,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); - rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, nlsc); if (rc) { spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index a9b6c3eba6de..e70915ad7541 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -98,7 +98,7 @@ static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const c get_ipc_unc(path, unc, sizeof(unc)); for (; *ses; ses++) { - if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName)) + if (!strcasecmp(unc, (*ses)->tcon_ipc->tree_name)) return *ses; } return ERR_PTR(-ENOENT); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 08f7392716e2..8b1c37158556 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -50,7 +50,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s } if (add_treename) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); else dfsplen = 0; @@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s return full_path; if (dfsplen) - memcpy(full_path, tcon->treeName, dfsplen); + memcpy(full_path, tcon->tree_name, dfsplen); full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); memcpy(full_path + dfsplen + 1, ctx->prepath, pplen); convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); @@ -93,7 +93,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, return ERR_PTR(-ENOMEM); if (prefix) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); else 
dfsplen = 0; @@ -123,7 +123,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, } if (dfsplen) { s -= dfsplen; - memcpy(s, tcon->treeName, dfsplen); + memcpy(s, tcon->tree_name, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { @@ -165,10 +165,9 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon) /* Inode operations in similar order to how they appear in Linux file fs.h */ -static int -cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, - struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, struct cifs_fid *fid) +static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, + struct tcon_link *tlink, unsigned int oflags, umode_t mode, __u32 *oplock, + struct cifs_fid *fid, struct cifs_open_info_data *buf) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; @@ -177,7 +176,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct cifs_tcon *tcon = tlink_tcon(tlink); const char *full_path; void *page = alloc_dentry_path(); - FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition; struct TCP_Server_Info *server = tcon->ses->server; @@ -290,12 +288,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, goto out; } - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - /* * if we're not using unix extensions, see if we need to set * ATTR_READONLY on the create call @@ -364,8 +356,7 @@ cifs_create_get_file_info: { #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* TODO: Add support for calling POSIX query info here, but passing in fid */ - rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, - xid, fid); + rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, xid, fid); if (newinode) { if (server->ops->set_lease_key) 
server->ops->set_lease_key(newinode, fid); @@ -402,7 +393,6 @@ cifs_create_set_dentry: d_add(direntry, newinode); out: - kfree(buf); free_dentry_path(page); return rc; @@ -423,10 +413,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; __u32 oplock; struct cifsFileInfo *file_info; + struct cifs_open_info_data buf = {}; if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) return -EIO; @@ -484,8 +475,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid); - + &oplock, &fid, &buf); if (rc) { cifs_del_pending_open(&open); goto out; @@ -510,7 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, file->f_op = &cifs_file_direct_ops; } - file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock, buf.symlink_target); if (file_info == NULL) { if (server->ops->close) server->ops->close(xid, tcon, &fid); @@ -526,6 +516,7 @@ out: cifs_put_tlink(tlink); out_free_xid: free_xid(xid); + cifs_free_open_info(&buf); return rc; } @@ -547,12 +538,15 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, struct TCP_Server_Info *server; struct cifs_fid fid; __u32 oplock; + struct cifs_open_info_data buf = {}; cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); - if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) - return -EIO; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) { + rc = -EIO; + goto out_free_xid; + } tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); @@ -565,11 +559,11 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, if (server->ops->new_lease_key) 
server->ops->new_lease_key(&fid); - rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid); + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, &oplock, &fid, &buf); if (!rc && server->ops->close) server->ops->close(xid, tcon, &fid); + cifs_free_open_info(&buf); cifs_put_tlink(tlink); out_free_xid: free_xid(xid); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6f38b134a346..cd9698209930 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -209,16 +209,14 @@ posix_open_ret: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -static int -cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, - struct cifs_fid *fid, unsigned int xid) +static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, + struct cifs_fid *fid, unsigned int xid, struct cifs_open_info_data *buf) { int rc; int desired_access; int disposition; int create_options = CREATE_NOT_DIR; - FILE_ALL_INFO *buf; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; @@ -255,10 +253,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci /* BB pass O_SYNC flag through on file attributes .. 
BB */ - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!buf) - return -ENOMEM; - /* O_SYNC also has bit for O_DSYNC so following check picks up either */ if (f_flags & O_SYNC) create_options |= CREATE_WRITE_THROUGH; @@ -276,9 +270,8 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci oparms.reconnect = false; rc = server->ops->open(xid, &oparms, oplock, buf); - if (rc) - goto out; + return rc; /* TODO: Add support for calling posix query info but with passing in fid */ if (tcon->unix_ext) @@ -294,8 +287,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci rc = -EOPENSTALE; } -out: - kfree(buf); return rc; } @@ -325,9 +316,9 @@ cifs_down_write(struct rw_semaphore *sem) static void cifsFileInfo_put_work(struct work_struct *work); -struct cifsFileInfo * -cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, - struct tcon_link *tlink, __u32 oplock) +struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock, + const char *symlink_target) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); @@ -347,6 +338,15 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, return NULL; } + if (symlink_target) { + cfile->symlink_target = kstrdup(symlink_target, GFP_KERNEL); + if (!cfile->symlink_target) { + kfree(fdlocks); + kfree(cfile); + return NULL; + } + } + INIT_LIST_HEAD(&fdlocks->locks); fdlocks->cfile = cfile; cfile->llist = fdlocks; @@ -440,6 +440,7 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); cifs_sb_deactive(sb); + kfree(cifs_file->symlink_target); kfree(cifs_file); } @@ -488,7 +489,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, struct cifsInodeInfo *cifsi = CIFS_I(inode); struct super_block *sb = inode->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - struct cifs_fid fid; + struct cifs_fid fid = 
{}; struct cifs_pending_open open; bool oplock_break_cancelled; @@ -570,8 +571,9 @@ int cifs_open(struct inode *inode, struct file *file) void *page; const char *full_path; bool posix_open_ok = false; - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; + struct cifs_open_info_data data = {}; xid = get_xid(); @@ -662,15 +664,15 @@ int cifs_open(struct inode *inode, struct file *file) if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); - rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, - file->f_flags, &oplock, &fid, xid); + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, file->f_flags, &oplock, &fid, + xid, &data); if (rc) { cifs_del_pending_open(&open); goto out; } } - cfile = cifs_new_fileinfo(&fid, file, tlink, oplock); + cfile = cifs_new_fileinfo(&fid, file, tlink, oplock, data.symlink_target); if (cfile == NULL) { if (server->ops->close) server->ops->close(xid, tcon, &fid); @@ -712,6 +714,7 @@ out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); + cifs_free_open_info(&data); return rc; } @@ -1882,11 +1885,13 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) struct cifsFileInfo *cfile; __u32 type; - rc = -EACCES; xid = get_xid(); - if (!(fl->fl_flags & FL_FLOCK)) - return -ENOLCK; + if (!(fl->fl_flags & FL_FLOCK)) { + rc = -ENOLCK; + free_xid(xid); + return rc; + } cfile = (struct cifsFileInfo *)file->private_data; tcon = tlink_tcon(cfile->tlink); @@ -1905,8 +1910,9 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) * if no lock or unlock then nothing to do since we do not * know what it is */ + rc = -EOPNOTSUPP; free_xid(xid); - return -EOPNOTSUPP; + return rc; } rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock, @@ -2428,12 +2434,16 @@ cifs_writev_complete(struct work_struct *work) struct cifs_writedata * cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { + struct cifs_writedata *writedata = NULL; struct page **pages = 
kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (pages) - return cifs_writedata_direct_alloc(pages, complete); + if (pages) { + writedata = cifs_writedata_direct_alloc(pages, complete); + if (!writedata) + kvfree(pages); + } - return NULL; + return writedata; } struct cifs_writedata * @@ -3293,6 +3303,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, cifs_uncached_writev_complete); if (!wdata) { rc = -ENOMEM; + for (i = 0; i < nr_pages; i++) + put_page(pagevec[i]); + kvfree(pagevec); add_credits_and_wake_if(server, credits, 0); break; } @@ -4271,6 +4284,15 @@ static ssize_t __cifs_readv( len = ctx->len; } + if (direct) { + rc = filemap_write_and_wait_range(file->f_inode->i_mapping, + offset, offset + len - 1); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EAGAIN; + } + } + /* grab a lock here due to read response handlers can access ctx */ mutex_lock(&ctx->aio_mutex); diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 0e13dec86b25..45119597c765 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -791,6 +791,13 @@ do { \ cifs_sb->ctx->field = NULL; \ } while (0) +#define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \ +do { \ + kfree_sensitive(ctx->field); \ + ctx->field = cifs_sb->ctx->field; \ + cifs_sb->ctx->field = NULL; \ +} while (0) + static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); @@ -811,7 +818,7 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); - STEAL_STRING(cifs_sb, ctx, password); + STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); @@ -1162,7 +1169,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } break; case Opt_pass: - kfree(ctx->password); + kfree_sensitive(ctx->password); 
ctx->password = NULL; if (strlen(param->string) == 0) break; @@ -1470,6 +1477,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, return 0; cifs_parse_mount_err: + kfree_sensitive(ctx->password); return -EINVAL; } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 23ef56f55ce5..a1751b956318 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -45,7 +45,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) memset(&key, 0, sizeof(key)); - sharename = extract_sharename(tcon->treeName); + sharename = extract_sharename(tcon->tree_name); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); return -EINVAL; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index bac08c20f559..4e2ca3c6e5c0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -210,6 +210,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) */ inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; } + + if (S_ISLNK(fattr->cf_mode)) { + kfree(cifs_i->symlink_target); + cifs_i->symlink_target = fattr->cf_symlink_target; + fattr->cf_symlink_target = NULL; + } spin_unlock(&inode->i_lock); if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) @@ -347,13 +353,22 @@ cifs_get_file_info_unix(struct file *filp) int rc; unsigned int xid; FILE_UNIX_BASIC_INFO find_data; - struct cifs_fattr fattr; + struct cifs_fattr fattr = {}; struct inode *inode = file_inode(filp); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); xid = get_xid(); + + if (cfile->symlink_target) { + fattr.cf_symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!fattr.cf_symlink_target) { + rc = -ENOMEM; + goto cifs_gfiunix_out; + } + } + rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data); if (!rc) { cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); @@ -378,6 +393,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, FILE_UNIX_BASIC_INFO 
find_data; struct cifs_fattr fattr; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -387,10 +403,12 @@ int cifs_get_inode_info_unix(struct inode **pinode, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; /* could have done a find first instead but this returns more info */ rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, cifs_sb->local_nls, cifs_remap(cifs_sb)); + cifs_dbg(FYI, "%s: query path info: rc = %d\n", __func__, rc); cifs_put_tlink(tlink); if (!rc) { @@ -410,6 +428,17 @@ int cifs_get_inode_info_unix(struct inode **pinode, cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } + if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) { + if (!server->ops->query_symlink) + return -EOPNOTSUPP; + rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, + &fattr.cf_symlink_target, false); + if (rc) { + cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); + goto cgiiu_exit; + } + } + if (*pinode == NULL) { /* get new inode */ cifs_fill_uniqueid(sb, &fattr); @@ -432,6 +461,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, } cgiiu_exit: + kfree(fattr.cf_symlink_target); return rc; } #else @@ -601,10 +631,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from POSIX info struct */ -static void -smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *info, - struct super_block *sb, bool adjust_tz, bool symlink) +static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct super_block *sb, bool adjust_tz, bool symlink) { + struct smb311_posix_qinfo *info = &data->posix_fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -639,6 +669,8 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * if 
(symlink) { fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode |= S_IFDIR; fattr->cf_dtype = DT_DIR; @@ -655,13 +687,11 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); } - -/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ -static void -cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, - struct super_block *sb, bool adjust_tz, - bool symlink, u32 reparse_tag) +static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct super_block *sb, bool adjust_tz, bool symlink, + u32 reparse_tag) { + struct smb2_file_all_info *info = &data->fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -703,7 +733,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) { fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; fattr->cf_dtype = DT_BLK; - } else if (symlink) { /* TODO add more reparse tag checks */ + } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK || + reparse_tag == IO_REPARSE_TAG_NFS) { fattr->cf_mode = S_IFLNK; fattr->cf_dtype = DT_LNK; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { @@ -735,6 +766,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, } } + if (S_ISLNK(fattr->cf_mode)) { + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; + } + fattr->cf_uid = cifs_sb->ctx->linux_uid; fattr->cf_gid = cifs_sb->ctx->linux_gid; } @@ -744,23 +780,28 @@ cifs_get_file_info(struct file *filp) { int rc; unsigned int xid; - FILE_ALL_INFO find_data; + struct cifs_open_info_data data = {}; struct cifs_fattr fattr; struct inode *inode = file_inode(filp); struct cifsFileInfo *cfile = 
filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; + bool symlink = false; + u32 tag = 0; if (!server->ops->query_file_info) return -ENOSYS; xid = get_xid(); - rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); + rc = server->ops->query_file_info(xid, tcon, cfile, &data); switch (rc) { case 0: /* TODO: add support to query reparse tag */ - cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, - false, 0 /* no reparse tag */); + if (data.symlink_target) { + symlink = true; + tag = IO_REPARSE_TAG_SYMLINK; + } + cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag); break; case -EREMOTE: cifs_create_dfs_fattr(&fattr, inode->i_sb); @@ -789,6 +830,7 @@ cifs_get_file_info(struct file *filp) /* if filetype is different, return error */ rc = cifs_fattr_to_inode(inode, &fattr); cgfi_exit: + cifs_free_open_info(&data); free_xid(xid); return rc; } @@ -860,14 +902,9 @@ cifs_backup_query_path_info(int xid, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -static void -cifs_set_fattr_ino(int xid, - struct cifs_tcon *tcon, - struct super_block *sb, - struct inode **inode, - const char *full_path, - FILE_ALL_INFO *data, - struct cifs_fattr *fattr) +static void cifs_set_fattr_ino(int xid, struct cifs_tcon *tcon, struct super_block *sb, + struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct cifs_fattr *fattr) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct TCP_Server_Info *server = tcon->ses->server; @@ -885,11 +922,8 @@ cifs_set_fattr_ino(int xid, * If we have an inode pass a NULL tcon to ensure we don't * make a round trip to the server. This only works for SMB2+. */ - rc = server->ops->get_srv_inum(xid, - *inode ? NULL : tcon, - cifs_sb, full_path, - &fattr->cf_uniqueid, - data); + rc = server->ops->get_srv_inum(xid, *inode ? 
NULL : tcon, cifs_sb, full_path, + &fattr->cf_uniqueid, data); if (rc) { /* * If that fails reuse existing ino or generate one @@ -913,7 +947,7 @@ cifs_set_fattr_ino(int xid, } else { /* make an ino by hashing the UNC */ fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO; - fattr->cf_uniqueid = simple_hashstr(tcon->treeName); + fattr->cf_uniqueid = simple_hashstr(tcon->tree_name); } } } @@ -923,14 +957,10 @@ static inline bool is_inode_cache_good(struct inode *ino) return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0; } -int -cifs_get_inode_info(struct inode **inode, - const char *full_path, - FILE_ALL_INFO *in_data, - struct super_block *sb, int xid, - const struct cifs_fid *fid) +int cifs_get_inode_info(struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct super_block *sb, int xid, + const struct cifs_fid *fid) { - struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; @@ -938,8 +968,7 @@ cifs_get_inode_info(struct inode **inode, bool adjust_tz = false; struct cifs_fattr fattr = {0}; bool is_reparse_point = false; - FILE_ALL_INFO *data = in_data; - FILE_ALL_INFO *tmp_data = NULL; + struct cifs_open_info_data tmp_data = {}; void *smb1_backup_rsp_buf = NULL; int rc = 0; int tmprc = 0; @@ -960,21 +989,15 @@ cifs_get_inode_info(struct inode **inode, cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto out; } - tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!tmp_data) { - rc = -ENOMEM; - goto out; - } - rc = server->ops->query_path_info(xid, tcon, cifs_sb, - full_path, tmp_data, - &adjust_tz, &is_reparse_point); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data, + &adjust_tz, &is_reparse_point); #ifdef CONFIG_CIFS_DFS_UPCALL if (rc == -ENOENT && is_tcon_dfs(tcon)) rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, full_path); #endif - data = tmp_data; + data = &tmp_data; } /* @@ -988,14 +1011,24 @@ cifs_get_inode_info(struct inode 
**inode, * since we have to check if its reparse tag matches a known * special file type e.g. symlink or fifo or char etc. */ - if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) && - server->ops->query_reparse_tag) { - rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, - full_path, &reparse_tag); - cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag); + if (is_reparse_point && data->symlink_target) { + reparse_tag = IO_REPARSE_TAG_SYMLINK; + } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) && + server->ops->query_reparse_tag) { + tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path, + &reparse_tag); + if (tmprc) + cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc); + if (server->ops->query_symlink) { + tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, + &data->symlink_target, + is_reparse_point); + if (tmprc) + cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__, + tmprc); + } } - cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, - is_reparse_point, reparse_tag); + cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1014,18 +1047,20 @@ cifs_get_inode_info(struct inode **inode, */ if (backup_cred(cifs_sb) && is_smb1_server(server)) { /* for easier reading */ + FILE_ALL_INFO *fi; FILE_DIRECTORY_INFO *fdi; SEARCH_ID_FULL_DIR_INFO *si; rc = cifs_backup_query_path_info(xid, tcon, sb, full_path, &smb1_backup_rsp_buf, - &data); + &fi); if (rc) goto out; - fdi = (FILE_DIRECTORY_INFO *)data; - si = (SEARCH_ID_FULL_DIR_INFO *)data; + move_cifs_info_to_smb2(&data->fi, fi); + fdi = (FILE_DIRECTORY_INFO *)fi; + si = (SEARCH_ID_FULL_DIR_INFO *)fi; cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb); fattr.cf_uniqueid = le64_to_cpu(si->UniqueId); @@ -1123,7 +1158,8 @@ handle_mnt_opt: out: cifs_buf_release(smb1_backup_rsp_buf); cifs_put_tlink(tlink); - kfree(tmp_data); + cifs_free_open_info(&tmp_data); + 
kfree(fattr.cf_symlink_target); return rc; } @@ -1138,7 +1174,7 @@ smb311_posix_get_inode_info(struct inode **inode, bool adjust_tz = false; struct cifs_fattr fattr = {0}; bool symlink = false; - struct smb311_posix_qinfo *data = NULL; + struct cifs_open_info_data data = {}; int rc = 0; int tmprc = 0; @@ -1155,15 +1191,9 @@ smb311_posix_get_inode_info(struct inode **inode, cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto out; } - data = kmalloc(sizeof(struct smb311_posix_qinfo), GFP_KERNEL); - if (!data) { - rc = -ENOMEM; - goto out; - } - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, data, - &adjust_tz, &symlink); + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz, + &symlink); /* * 2. Convert it to internal cifs metadata (fattr) @@ -1171,7 +1201,7 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1228,7 +1258,8 @@ smb311_posix_get_inode_info(struct inode **inode, } out: cifs_put_tlink(tlink); - kfree(data); + cifs_free_open_info(&data); + kfree(fattr.cf_symlink_target); return rc; } @@ -2265,13 +2296,13 @@ cifs_dentry_needs_reval(struct dentry *dentry) return true; if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { - mutex_lock(&cfid->fid_mutex); + spin_lock(&cfid->fid_lock); if (cfid->time && cifs_i->time > cfid->time) { - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); return false; } - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); } /* @@ -2327,7 +2358,7 @@ cifs_invalidate_mapping(struct inode *inode) static int cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) 
return -ERESTARTSYS; return 0; @@ -2345,7 +2376,7 @@ cifs_revalidate_mapping(struct inode *inode) return 0; rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (rc) return rc; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index b6e6e5d6c8dd..89d5fa887364 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -484,12 +484,35 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(tlink); if (tcon && tcon->ses->server->ops->notify) { rc = tcon->ses->server->ops->notify(xid, - filep, (void __user *)arg); + filep, (void __user *)arg, + false /* no ret data */); cifs_dbg(FYI, "ioctl notify rc %d\n", rc); } else rc = -EOPNOTSUPP; cifs_put_tlink(tlink); break; + case CIFS_IOC_NOTIFY_INFO: + if (!S_ISDIR(inode->i_mode)) { + /* Notify can only be done on directories */ + rc = -EOPNOTSUPP; + break; + } + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + tcon = tlink_tcon(tlink); + if (tcon && tcon->ses->server->ops->notify) { + rc = tcon->ses->server->ops->notify(xid, + filep, (void __user *)arg, + true /* return details */); + cifs_dbg(FYI, "ioctl notify info rc %d\n", rc); + } else + rc = -EOPNOTSUPP; + cifs_put_tlink(tlink); + break; case CIFS_IOC_SHUTDOWN: rc = cifs_shutdown(inode->i_sb, arg); break; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 6803cb27eecc..bd374feeccaa 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -38,29 +38,28 @@ static int symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) { int rc; - struct crypto_shash *md5 = NULL; - struct sdesc *sdescmd5 = NULL; + struct shash_desc *md5 = NULL; - rc = cifs_alloc_hash("md5", &md5, &sdescmd5); + rc = cifs_alloc_hash("md5", &md5); if (rc) goto symlink_hash_err; - rc = crypto_shash_init(&sdescmd5->shash); + rc = crypto_shash_init(md5); if (rc) { cifs_dbg(VFS, "%s: Could not init md5 
shash\n", __func__); goto symlink_hash_err; } - rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_update(md5, link_str, link_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); goto symlink_hash_err; } - rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + rc = crypto_shash_final(md5, md5_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: - cifs_free_hash(&md5, &sdescmd5); + cifs_free_hash(&md5); return rc; } @@ -202,40 +201,6 @@ out: return rc; } -static int -query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const unsigned char *path, - char **symlinkinfo) -{ - int rc; - u8 *buf = NULL; - unsigned int link_len = 0; - unsigned int bytes_read = 0; - - buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (tcon->ses->server->ops->query_mf_symlink) - rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, - cifs_sb, path, buf, &bytes_read); - else - rc = -ENOSYS; - - if (rc) - goto out; - - if (bytes_read == 0) { /* not a symlink */ - rc = -EINVAL; - goto out; - } - - rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo); -out: - kfree(buf); - return rc; -} - int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -245,6 +210,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; + char *symlink = NULL; if (!couldbe_mf_symlink(fattr)) /* it's not a symlink */ @@ -266,7 +232,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (bytes_read == 0) /* not a symlink */ goto out; - rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); + rc = parse_mf_symlink(buf, bytes_read, &link_len, &symlink); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -281,6 +247,7 @@ check_mf_symlink(unsigned int xid, struct 
cifs_tcon *tcon, fattr->cf_mode &= ~S_IFMT; fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; fattr->cf_dtype = DT_LNK; + fattr->cf_symlink_target = symlink; out: kfree(buf); return rc; @@ -600,75 +567,6 @@ cifs_hl_exit: return rc; } -const char * -cifs_get_link(struct dentry *direntry, struct inode *inode, - struct delayed_call *done) -{ - int rc = -ENOMEM; - unsigned int xid; - const char *full_path; - void *page; - char *target_path = NULL; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct tcon_link *tlink = NULL; - struct cifs_tcon *tcon; - struct TCP_Server_Info *server; - - if (!direntry) - return ERR_PTR(-ECHILD); - - xid = get_xid(); - - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - free_xid(xid); - return ERR_CAST(tlink); - } - tcon = tlink_tcon(tlink); - server = tcon->ses->server; - - page = alloc_dentry_path(); - full_path = build_path_from_dentry(direntry, page); - if (IS_ERR(full_path)) { - free_xid(xid); - cifs_put_tlink(tlink); - free_dentry_path(page); - return ERR_CAST(full_path); - } - - cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); - - rc = -EACCES; - /* - * First try Minshall+French Symlinks, if configured - * and fallback to UNIX Extensions Symlinks. 
- */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, - &target_path); - - if (rc != 0 && server->ops->query_symlink) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); - bool reparse_point = false; - - if (cifsi->cifsAttrs & ATTR_REPARSE) - reparse_point = true; - - rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &target_path, reparse_point); - } - - free_dentry_path(page); - free_xid(xid); - cifs_put_tlink(tlink); - if (rc != 0) { - kfree(target_path); - return ERR_PTR(rc); - } - set_delayed_call(done, kfree_link, target_path); - return target_path; -} - int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct dentry *direntry, const char *symname) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 87f60f736731..3e68d8208cf5 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -117,8 +117,8 @@ tconInfoAlloc(void) ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); if (!ret_buf) return NULL; - ret_buf->cfid = init_cached_dir(); - if (!ret_buf->cfid) { + ret_buf->cfids = init_cached_dirs(); + if (!ret_buf->cfids) { kfree(ret_buf); return NULL; } @@ -144,7 +144,7 @@ tconInfoFree(struct cifs_tcon *tcon) cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n"); return; } - free_cached_dir(tcon); + free_cached_dirs(tcon->cfids); atomic_dec(&tconInfoAllocCount); kfree(tcon->nativeFileSystem); kfree_sensitive(tcon->password); @@ -400,6 +400,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) { struct smb_hdr *buf = (struct smb_hdr *)buffer; struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsInodeInfo *pCifsInode; @@ -464,9 +465,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) return false; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(srv) ? 
srv->primary_server : srv; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &srv->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid != buf->Tid) continue; @@ -525,7 +529,7 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; cifs_sb->mnt_cifs_serverino_autodisabled = true; cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n", - tcon ? tcon->treeName : "new server"); + tcon ? tcon->tree_name : "new server"); cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS)\n"); cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n"); @@ -824,7 +828,7 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) free_dentry_path(page); } -/* parses DFS refferal V3 structure +/* parses DFS referral V3 structure * caller is responsible for freeing target_nodes * returns: * - on success - 0 @@ -1071,59 +1075,58 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) /** * cifs_alloc_hash - allocate hash and hash context together * @name: The name of the crypto hash algo - * @shash: Where to put the pointer to the hash algo - * @sdesc: Where to put the pointer to the hash descriptor + * @sdesc: SHASH descriptor where to put the pointer to the hash TFM * * The caller has to make sure @sdesc is initialized to either NULL or - * a valid context. Both can be freed via cifs_free_hash(). + * a valid context. It can be freed via cifs_free_hash(). 
*/ int -cifs_alloc_hash(const char *name, - struct crypto_shash **shash, struct sdesc **sdesc) +cifs_alloc_hash(const char *name, struct shash_desc **sdesc) { int rc = 0; - size_t size; + struct crypto_shash *alg = NULL; - if (*sdesc != NULL) + if (*sdesc) return 0; - *shash = crypto_alloc_shash(name, 0, 0); - if (IS_ERR(*shash)) { - cifs_dbg(VFS, "Could not allocate crypto %s\n", name); - rc = PTR_ERR(*shash); - *shash = NULL; + alg = crypto_alloc_shash(name, 0, 0); + if (IS_ERR(alg)) { + cifs_dbg(VFS, "Could not allocate shash TFM '%s'\n", name); + rc = PTR_ERR(alg); *sdesc = NULL; return rc; } - size = sizeof(struct shash_desc) + crypto_shash_descsize(*shash); - *sdesc = kmalloc(size, GFP_KERNEL); + *sdesc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(alg), GFP_KERNEL); if (*sdesc == NULL) { - cifs_dbg(VFS, "no memory left to allocate crypto %s\n", name); - crypto_free_shash(*shash); - *shash = NULL; + cifs_dbg(VFS, "no memory left to allocate shash TFM '%s'\n", name); + crypto_free_shash(alg); return -ENOMEM; } - (*sdesc)->shash.tfm = *shash; + (*sdesc)->tfm = alg; return 0; } /** * cifs_free_hash - free hash and hash context together - * @shash: Where to find the pointer to the hash algo - * @sdesc: Where to find the pointer to the hash descriptor + * @sdesc: Where to find the pointer to the hash TFM * - * Freeing a NULL hash or context is safe. + * Freeing a NULL descriptor is safe. 
*/ void -cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) +cifs_free_hash(struct shash_desc **sdesc) { - kfree(*sdesc); + if (unlikely(!sdesc) || !*sdesc) + return; + + if ((*sdesc)->tfm) { + crypto_free_shash((*sdesc)->tfm); + (*sdesc)->tfm = NULL; + } + + kfree_sensitive(*sdesc); *sdesc = NULL; - if (*shash) - crypto_free_shash(*shash); - *shash = NULL; } /** @@ -1328,7 +1331,7 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, char *treename, *dfspath, sep; int treenamelen, linkpathlen, rc; - treename = tcon->treeName; + treename = tcon->tree_name; /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL * messages MUST be encoded with exactly one leading backslash, not two * leading backslashes. diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 8e060c00c969..2d75ba5aaa8a 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -844,17 +844,34 @@ static bool emit_cached_dirents(struct cached_dirents *cde, struct dir_context *ctx) { struct cached_dirent *dirent; - int rc; + bool rc; list_for_each_entry(dirent, &cde->entries, entry) { - if (ctx->pos >= dirent->pos) + /* + * Skip all early entries prior to the current lseek() + * position. + */ + if (ctx->pos > dirent->pos) continue; + /* + * We recorded the current ->pos value for the dirent + * when we stored it in the cache. + * However, this sequence of ->pos values may have holes + * in it, for example dot-dirs returned from the server + * are suppressed. + * Handle this by forcing ctx->pos to be the same as the + * ->pos of the current dirent we emit from the cache. + * This means that when we emit these entries from the cache + * we now emit them with the same ->pos value as in the + * initial scan. 
+ */ ctx->pos = dirent->pos; rc = dir_emit(ctx, dirent->name, dirent->namelen, dirent->fattr.cf_uniqueid, dirent->fattr.cf_dtype); if (!rc) return rc; + ctx->pos++; } return true; } @@ -994,6 +1011,8 @@ static int cifs_filldir(char *find_entry, struct file *file, cifs_unix_basic_to_fattr(&fattr, &((FILE_UNIX_INFO *)find_entry)->basic, cifs_sb); + if (S_ISLNK(fattr.cf_mode)) + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; break; case SMB_FIND_FILE_INFO_STANDARD: cifs_std_info_to_fattr(&fattr, @@ -1202,10 +1221,10 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) ctx->pos, tmp_buf); cifs_save_resume_key(current_entry, cifsFile); break; - } else - current_entry = - nxt_dir_entry(current_entry, end_of_smb, - cifsFile->srch_inf.info_level); + } + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); } kfree(tmp_buf); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3af3b05b6c74..92e4278ec35d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -496,6 +496,7 @@ out: cifs_put_tcp_session(chan->server, 0); } + free_xid(xid); return rc; } @@ -601,11 +602,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* BB FIXME add check that strings total less than 335 or will need to send them as arrays */ - /* unicode strings, must be word aligned before the call */ -/* if ((long) bcc_ptr % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } */ /* copy user */ if (ses->user_name == NULL) { /* null user mount */ @@ -1213,10 +1209,18 @@ out_free_smb_buf: static void sess_free_buffer(struct sess_data *sess_data) { + struct kvec *iov = sess_data->iov; + + /* + * Zero the session data before freeing, as it might contain sensitive info (keys, etc). + * Note that iov[1] is already freed by caller. 
+ */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base) + memzero_explicit(iov[0].iov_base, iov[0].iov_len); - free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + free_rsp_buf(sess_data->buf0_type, iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; - kfree(sess_data->iov[2].iov_base); + kfree_sensitive(iov[2].iov_base); } static int @@ -1318,7 +1322,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) } if (ses->capabilities & CAP_UNICODE) { - if (sess_data->iov[0].iov_len % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1358,7 +1362,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1374,7 +1378,7 @@ out: sess_data->result = rc; sess_data->func = NULL; sess_free_buffer(sess_data); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; } @@ -1442,8 +1446,7 @@ sess_auth_kerberos(struct sess_data *sess_data) if (ses->capabilities & CAP_UNICODE) { /* unicode strings must be word aligned */ - if ((sess_data->iov[0].iov_len - + sess_data->iov[1].iov_len) % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1494,7 +1497,7 @@ sess_auth_kerberos(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1513,7 +1516,7 @@ out: sess_data->result = rc; sess_data->func = NULL; sess_free_buffer(sess_data); - 
kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; } @@ -1546,7 +1549,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) bcc_ptr = sess_data->iov[2].iov_base; /* unicode strings must be word aligned */ - if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1648,7 +1651,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); out_free_ntlmsspblob: - kfree(ntlmsspblob); + kfree_sensitive(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1658,9 +1661,9 @@ out: } /* Else error. Cleanup */ - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->func = NULL; @@ -1747,7 +1750,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1759,7 +1762,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) } out_free_ntlmsspblob: - kfree(ntlmsspblob); + kfree_sensitive(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1767,9 +1770,9 @@ out: rc = sess_establish_session(sess_data); /* Cleanup */ - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->func = NULL; @@ -1845,7 +1848,7 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, rc = sess_data->result; out: - kfree(sess_data); + 
kfree_sensitive(sess_data); return rc; } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index f36b2d2d40ca..50480751e521 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -542,31 +542,32 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink) +static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink) { int rc; + FILE_ALL_INFO fi = {}; *symlink = false; /* could do find first instead but this returns more info */ - rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, + cifs_remap(cifs_sb)); /* * BB optimize code so we do not make the above call when server claims * no NT SMB support and the above call failed at least once - set flag * in tcon or mount. 
*/ if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { - rc = SMBQueryInformation(xid, tcon, full_path, data, - cifs_sb->local_nls, + rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (!rc) + move_cifs_info_to_smb2(&data->fi, &fi); *adjustTZ = true; } - if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { + if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) { int tmprc; int oplock = 0; struct cifs_fid fid; @@ -592,10 +593,9 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - u64 *uniqueid, FILE_ALL_INFO *data) +static int cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, struct cifs_open_info_data *unused) { /* * We can not use the IndexNumber field by default from Windows or @@ -613,11 +613,22 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, cifs_remap(cifs_sb)); } -static int -cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *fid, FILE_ALL_INFO *data) +static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data) { - return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); + int rc; + FILE_ALL_INFO fi = {}; + + if (cfile->symlink_target) { + data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!data->symlink_target) + return -ENOMEM; + } + + rc = CIFSSMBQFileInfo(xid, tcon, cfile->fid.netfid, &fi); + if (!rc) + move_cifs_info_to_smb2(&data->fi, &fi); + return rc; } static void @@ -702,19 +713,20 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path, cifsInode->cifsAttrs = dosattrs; } -static int -cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, - __u32 *oplock, 
FILE_ALL_INFO *buf) +static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf) { + FILE_ALL_INFO *fi = buf; + if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS)) return SMBLegacyOpen(xid, oparms->tcon, oparms->path, oparms->disposition, oparms->desired_access, oparms->create_options, - &oparms->fid->netfid, oplock, buf, + &oparms->fid->netfid, oplock, fi, oparms->cifs_sb->local_nls, cifs_remap(oparms->cifs_sb)); - return CIFS_open(xid, oparms, oplock, buf); + return CIFS_open(xid, oparms, oplock, fi); } static void diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 9dfd2dd612c2..ffbd9a99fc12 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -20,40 +20,125 @@ #include "cifs_unicode.h" #include "fscache.h" #include "smb2proto.h" +#include "smb2status.h" -int -smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf) +static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) +{ + struct smb2_err_rsp *err = iov->iov_base; + struct smb2_symlink_err_rsp *sym = ERR_PTR(-EINVAL); + u32 len; + + if (err->ErrorContextCount) { + struct smb2_error_context_rsp *p, *end; + + len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp, + ErrorContextData) + + sizeof(struct smb2_symlink_err_rsp)); + if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err)) + return ERR_PTR(-EINVAL); + + p = (struct smb2_error_context_rsp *)err->ErrorData; + end = (struct smb2_error_context_rsp *)((u8 *)err + iov->iov_len); + do { + if (le32_to_cpu(p->ErrorId) == SMB2_ERROR_ID_DEFAULT) { + sym = (struct smb2_symlink_err_rsp *)&p->ErrorContextData; + break; + } + cifs_dbg(FYI, "%s: skipping unhandled error context: 0x%x\n", + __func__, le32_to_cpu(p->ErrorId)); + + len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8); + p = (struct smb2_error_context_rsp *)((u8 *)&p->ErrorContextData + len); + } while (p < end); + } else if 
(le32_to_cpu(err->ByteCount) >= sizeof(*sym) && + iov->iov_len >= SMB2_SYMLINK_STRUCT_SIZE) { + sym = (struct smb2_symlink_err_rsp *)err->ErrorData; + } + + if (!IS_ERR(sym) && (le32_to_cpu(sym->SymLinkErrorTag) != SYMLINK_ERROR_TAG || + le32_to_cpu(sym->ReparseTag) != IO_REPARSE_TAG_SYMLINK)) + sym = ERR_PTR(-EINVAL); + + return sym; +} + +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path) +{ + struct smb2_symlink_err_rsp *sym; + unsigned int sub_offs, sub_len; + unsigned int print_offs, print_len; + char *s; + + if (!cifs_sb || !iov || !iov->iov_base || !iov->iov_len || !path) + return -EINVAL; + + sym = symlink_data(iov); + if (IS_ERR(sym)) + return PTR_ERR(sym); + + sub_len = le16_to_cpu(sym->SubstituteNameLength); + sub_offs = le16_to_cpu(sym->SubstituteNameOffset); + print_len = le16_to_cpu(sym->PrintNameLength); + print_offs = le16_to_cpu(sym->PrintNameOffset); + + if (iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offs + sub_len || + iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + print_offs + print_len) + return -EINVAL; + + s = cifs_strndup_from_utf16((char *)sym->PathBuffer + sub_offs, sub_len, true, + cifs_sb->local_nls); + if (!s) + return -ENOMEM; + convert_delimiter(s, '/'); + cifs_dbg(FYI, "%s: symlink target: %s\n", __func__, s); + + *path = s; + return 0; +} + +int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf) { int rc; __le16 *smb2_path; - struct smb2_file_all_info *smb2_data = NULL; __u8 smb2_oplock; + struct cifs_open_info_data *data = buf; + struct smb2_file_all_info file_info = {}; + struct smb2_file_all_info *smb2_data = data ? 
&file_info : NULL; + struct kvec err_iov = {}; + int err_buftype = CIFS_NO_BUFFER; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); - if (smb2_path == NULL) { - rc = -ENOMEM; - goto out; - } - - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) { - rc = -ENOMEM; - goto out; - } + if (smb2_path == NULL) + return -ENOMEM; oparms->desired_access |= FILE_READ_ATTRIBUTES; smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; - rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, - NULL, NULL); + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, + &err_buftype); + if (rc && data) { + struct smb2_hdr *hdr = err_iov.iov_base; + + if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER)) + rc = -ENOMEM; + else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov, + &data->symlink_target); + if (!rc) { + memset(smb2_data, 0, sizeof(*smb2_data)); + oparms->create_options |= OPEN_REPARSE_POINT; + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, + NULL, NULL, NULL); + oparms->create_options &= ~OPEN_REPARSE_POINT; + } + } + } + if (rc) goto out; - if (oparms->tcon->use_resilient) { /* default timeout is 0, servers pick default (120 seconds) */ nr_ioctl_req.Timeout = @@ -73,7 +158,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, rc = 0; } - if (buf) { + if (smb2_data) { /* if open response does not have IndexNumber field - get it */ if (smb2_data->IndexNumber == 0) { rc = SMB2_get_srv_num(xid, oparms->tcon, @@ -89,12 +174,12 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, rc = 0; } } - move_smb2_info_to_cifs(buf, smb2_data); + memcpy(&data->fi, smb2_data, sizeof(data->fi)); } *oplock = smb2_oplock; out: - kfree(smb2_data); + free_rsp_buf(err_buftype, 
err_iov.iov_base); kfree(smb2_path); return rc; } diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index b83f59051b26..68e08c85fbb8 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -24,6 +24,7 @@ #include "smb2pdu.h" #include "smb2proto.h" #include "cached_dir.h" +#include "smb2status.h" static void free_set_inf_compound(struct smb_rqst *rqst) @@ -50,13 +51,15 @@ struct cop_vars { /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. + * + * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all + * error responses. Caller is also responsible for freeing them up. */ -static int -smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, - __u32 create_options, umode_t mode, void *ptr, int command, - struct cifsFileInfo *cfile) +static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 desired_access, __u32 create_disposition, __u32 create_options, + umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, + struct kvec *err_iov, int *err_buftype) { struct cop_vars *vars = NULL; struct kvec *rsp_iov; @@ -70,6 +73,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, int num_rqst = 0; int resp_buftype[3]; struct smb2_query_info_rsp *qi_rsp = NULL; + struct cifs_open_info_data *idata; int flags = 0; __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; unsigned int size[2]; @@ -379,20 +383,25 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_open_free(&rqst[0]); if (rc == -EREMCHG) { - pr_warn_once("server share %s deleted\n", tcon->treeName); + pr_warn_once("server share %s deleted\n", tcon->tree_name); tcon->need_reconnect = true; } switch (command) { case SMB2_OP_QUERY_INFO: + idata = ptr; 
+ if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } if (rc == 0) { qi_rsp = (struct smb2_query_info_rsp *) rsp_iov[1].iov_base; rc = smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(struct smb2_file_all_info), - ptr); + &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi); } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); @@ -406,13 +415,20 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, tcon->tid); break; case SMB2_OP_POSIX_QUERY_INFO: + idata = ptr; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } if (rc == 0) { qi_rsp = (struct smb2_query_info_rsp *) rsp_iov[1].iov_base; rc = smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(struct smb311_posix_qinfo) /* add SIDs */, ptr); + &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, + (char *)&idata->posix_fi); } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); @@ -477,42 +493,33 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, free_set_inf_compound(rqst); break; } - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + + if (rc && err_iov && err_buftype) { + memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov)); + memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype)); + } else { + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + } kfree(vars); return rc; } -void -move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) -{ - memcpy(dst, src, 
(size_t)(&src->CurrentByteOffset) - (size_t)src); - dst->CurrentByteOffset = src->CurrentByteOffset; - dst->Mode = src->Mode; - dst->AlignmentRequirement = src->AlignmentRequirement; - dst->IndexNumber1 = 0; /* we don't use it */ -} - -int -smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse) +int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) { int rc; - struct smb2_file_all_info *smb2_data; __u32 create_options = 0; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; + struct kvec err_iov[3] = {}; + int err_buftype[3] = {}; *adjust_tz = false; *reparse = false; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - if (strcmp(full_path, "")) rc = -ENOENT; else @@ -520,63 +527,58 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* If it is a root and its handle is cached then use it */ if (!rc) { if (cfid->file_all_info_is_valid) { - move_smb2_info_to_cifs(data, - &cfid->file_all_info); + memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi)); } else { - rc = SMB2_query_info(xid, tcon, - cfid->fid.persistent_fid, - cfid->fid.volatile_fid, smb2_data); - if (!rc) - move_smb2_info_to_cifs(data, smb2_data); + rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid, &data->fi); } close_cached_dir(cfid); - goto out; + return rc; } cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, 
SMB2_OP_QUERY_INFO, cfile, + err_iov, err_buftype); if (rc == -EOPNOTSUPP) { + if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && + ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && + ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); + if (rc) + goto out; + } *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE, data, + SMB2_OP_QUERY_INFO, cfile, NULL, NULL); } - if (rc) - goto out; - move_smb2_info_to_cifs(data, smb2_data); out: - kfree(smb2_data); + free_rsp_buf(err_buftype[0], err_iov[0].iov_base); + free_rsp_buf(err_buftype[1], err_iov[1].iov_base); + free_rsp_buf(err_buftype[2], err_iov[2].iov_base); return rc; } -int -smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct smb311_posix_qinfo *data, bool *adjust_tz, bool *reparse) +int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; - struct smb311_posix_qinfo *smb2_data; + struct kvec err_iov[3] = {}; + int err_buftype[3] = {}; *adjust_tz = false; *reparse = false; - /* BB TODO: Make struct larger when add support for parsing owner SIDs */ - smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo), - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - /* * BB TODO: Add support for using the 
cached root handle. * Create SMB2_query_posix_info worker function to do non-compounded query @@ -585,29 +587,32 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, + err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ + if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && + ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && + ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); + if (rc) + goto out; + } *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE, data, + SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL); } - if (rc) - goto out; - - /* TODO: will need to allow for the 2 SIDs when add support for getting owner UID/GID */ - memcpy(data, smb2_data, sizeof(struct smb311_posix_qinfo)); out: - kfree(smb2_data); + free_rsp_buf(err_buftype[0], err_iov[0].iov_base); + free_rsp_buf(err_buftype[1], err_iov[1].iov_base); + free_rsp_buf(err_buftype[2], err_iov[2].iov_base); return rc; } @@ -619,7 +624,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, 
return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, - NULL); + NULL, NULL, NULL); } void @@ -641,7 +646,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile); + &data, SMB2_OP_SET_INFO, cfile, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -650,9 +655,10 @@ int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { + drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL); + NULL, SMB2_OP_RMDIR, NULL, NULL, NULL); } int @@ -661,7 +667,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL); } static int @@ -680,7 +686,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile); + command, cfile, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -693,6 +699,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, { struct cifsFileInfo *cfile; + drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb); cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile); return smb2_set_path_attr(xid, tcon, from_name, to_name, @@ -720,7 +727,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_DATA, 
FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile); + &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL); } int @@ -746,7 +753,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, - 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile); + 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, + NULL, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index d73e5672aac4..572293c18e16 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -135,6 +135,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, int smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) { + struct TCP_Server_Info *pserver; struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)shdr; int hdr_size = sizeof(struct smb2_hdr); @@ -143,6 +144,9 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) __u32 calc_len; /* calculated length */ __u64 mid; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + /* * Add function to do table lookup of StructureSize by command * ie Validate the wct via smb2_struct_sizes table above @@ -155,7 +159,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(iter, &pserver->smb_ses_list, smb_ses_list) { if (iter->Suid == le64_to_cpu(thdr->SessionId)) { ses = iter; break; @@ -248,7 +252,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. 
*/ - if (((calc_len + 7) & ~7) == len) + if (ALIGN(calc_len, 8) == len) return 0; /* @@ -608,51 +612,52 @@ smb2_tcon_find_pending_open_lease(struct cifs_tcon *tcon, } static bool -smb2_is_valid_lease_break(char *buffer) +smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) { struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; - struct TCP_Server_Info *server; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifs_pending_open *open; cifs_dbg(FYI, "Checking for lease break\n"); + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { - spin_lock(&tcon->open_file_lock); - cifs_stats_inc( - &tcon->stats.cifs_stats.num_oplock_brks); - if (smb2_tcon_has_lease(tcon, rsp)) { - spin_unlock(&tcon->open_file_lock); - spin_unlock(&cifs_tcp_ses_lock); - return true; - } - open = smb2_tcon_find_pending_open_lease(tcon, - rsp); - if (open) { - __u8 lease_key[SMB2_LEASE_KEY_SIZE]; - struct tcon_link *tlink; - - tlink = cifs_get_tlink(open->tlink); - memcpy(lease_key, open->lease_key, - SMB2_LEASE_KEY_SIZE); - spin_unlock(&tcon->open_file_lock); - spin_unlock(&cifs_tcp_ses_lock); - smb2_queue_pending_open_break(tlink, - lease_key, - rsp->NewLeaseState); - return true; - } + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + spin_lock(&tcon->open_file_lock); + cifs_stats_inc( + &tcon->stats.cifs_stats.num_oplock_brks); + if (smb2_tcon_has_lease(tcon, rsp)) { spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + open = smb2_tcon_find_pending_open_lease(tcon, + rsp); + if 
(open) { + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; + struct tcon_link *tlink; + + tlink = cifs_get_tlink(open->tlink); + memcpy(lease_key, open->lease_key, + SMB2_LEASE_KEY_SIZE); + spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_tcp_ses_lock); + smb2_queue_pending_open_break(tlink, + lease_key, + rsp->NewLeaseState); + return true; + } + spin_unlock(&tcon->open_file_lock); - if (cached_dir_lease_break(tcon, rsp->LeaseKey)) { - spin_unlock(&cifs_tcp_ses_lock); - return true; - } + if (cached_dir_lease_break(tcon, rsp->LeaseKey)) { + spin_unlock(&cifs_tcp_ses_lock); + return true; } } } @@ -671,6 +676,7 @@ bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode; @@ -684,16 +690,19 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) - return smb2_is_valid_lease_break(buffer); + return smb2_is_valid_lease_break(buffer, server); else return false; } cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel); + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? 
server->primary_server : server; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); @@ -870,8 +879,8 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, struct kvec *iov, int nvec) { int i, rc; - struct sdesc *d; struct smb2_hdr *hdr; + struct shash_desc *sha512 = NULL; hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ @@ -901,14 +910,14 @@ ok: if (rc) return rc; - d = server->secmech.sdescsha512; - rc = crypto_shash_init(&d->shash); + sha512 = server->secmech.sha512; + rc = crypto_shash_init(sha512); if (rc) { cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); return rc; } - rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash, + rc = crypto_shash_update(sha512, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); @@ -916,8 +925,7 @@ ok: } for (i = 0; i < nvec; i++) { - rc = crypto_shash_update(&d->shash, - iov[i].iov_base, iov[i].iov_len); + rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); @@ -925,7 +933,7 @@ ok: } } - rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash); + rc = crypto_shash_final(sha512, ses->preauth_sha_hash); if (rc) { cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", __func__); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 421be43af425..880cd494afea 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -512,8 +512,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, - size_t buf_len, - struct cifs_ses *ses) + size_t buf_len, struct cifs_ses *ses, 
bool in_mount) { struct network_interface_info_ioctl_rsp *p; struct sockaddr_in *addr4; @@ -531,6 +530,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, p = buf; spin_lock(&ses->iface_lock); + ses->iface_count = 0; /* * Go through iface_list and do kref_put to remove * any unused ifaces. ifaces in use will be removed @@ -543,6 +543,21 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, } spin_unlock(&ses->iface_lock); + /* + * Samba server e.g. can return an empty interface list in some cases, + * which would only be a problem if we were requesting multichannel + */ + if (bytes_left == 0) { + /* avoid spamming logs every 10 minutes, so log only in mount */ + if ((ses->chan_max > 1) && in_mount) + cifs_dbg(VFS, + "multichannel not available\n" + "Empty network interface list returned by server %s\n", + ses->server->hostname); + rc = -EINVAL; + goto out; + } + while (bytes_left >= sizeof(*p)) { memset(&tmp_iface, 0, sizeof(tmp_iface)); tmp_iface.speed = le64_to_cpu(p->LinkSpeed); @@ -637,9 +652,9 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, kref_put(&iface->refcount, release_iface); } else list_add_tail(&info->iface_head, &ses->iface_list); - spin_unlock(&ses->iface_lock); ses->iface_count++; + spin_unlock(&ses->iface_lock); ses->iface_last_update = jiffies; next_iface: nb_iface++; @@ -673,7 +688,7 @@ out: } int -SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount) { int rc; unsigned int ret_data_len = 0; @@ -693,7 +708,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) goto out; } - rc = parse_server_interfaces(out_buf, ret_data_len, ses); + rc = parse_server_interfaces(out_buf, ret_data_len, ses, in_mount); if (rc) goto out; @@ -729,7 +744,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return; - SMB3_request_interfaces(xid, tcon); + 
SMB3_request_interfaces(xid, tcon, true /* called during mount */); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_ATTRIBUTE_INFORMATION); @@ -787,7 +802,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid); if (!rc) { - if (cfid->is_valid) { + if (cfid->has_lease) { close_cached_dir(cfid); return 0; } @@ -817,33 +832,25 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - u64 *uniqueid, FILE_ALL_INFO *data) +static int smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, struct cifs_open_info_data *data) { - *uniqueid = le64_to_cpu(data->IndexNumber); + *uniqueid = le64_to_cpu(data->fi.IndexNumber); return 0; } -static int -smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *fid, FILE_ALL_INFO *data) +static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data) { - int rc; - struct smb2_file_all_info *smb2_data; + struct cifs_fid *fid = &cfile->fid; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - - rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, - smb2_data); - if (!rc) - move_smb2_info_to_cifs(data, smb2_data); - kfree(smb2_data); - return rc; + if (cfile->symlink_target) { + data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!data->symlink_target) + return -ENOMEM; + } + return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi); } #ifdef CONFIG_CIFS_XATTR @@ -1327,7 +1334,7 @@ SMB2_request_res_key(const unsigned int xid, struct 
cifs_tcon *tcon, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); if (rc == -EOPNOTSUPP) { - pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); + pr_warn_once("Server share %s does not support copy range\n", tcon->tree_name); goto req_res_key_exit; } else if (rc) { cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); @@ -2012,9 +2019,10 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, static int smb3_notify(const unsigned int xid, struct file *pfile, - void __user *ioc_buf) + void __user *ioc_buf, bool return_changes) { - struct smb3_notify notify; + struct smb3_notify_info notify; + struct smb3_notify_info __user *pnotify_buf; struct dentry *dentry = pfile->f_path.dentry; struct inode *inode = file_inode(pfile); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -2022,10 +2030,12 @@ smb3_notify(const unsigned int xid, struct file *pfile, struct cifs_fid fid; struct cifs_tcon *tcon; const unsigned char *path; + char *returned_ioctl_info = NULL; void *page = alloc_dentry_path(); __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; int rc = 0; + __u32 ret_len = 0; path = build_path_from_dentry(dentry, page); if (IS_ERR(path)) { @@ -2039,9 +2049,17 @@ smb3_notify(const unsigned int xid, struct file *pfile, goto notify_exit; } - if (copy_from_user(¬ify, ioc_buf, sizeof(struct smb3_notify))) { - rc = -EFAULT; - goto notify_exit; + if (return_changes) { + if (copy_from_user(¬ify, ioc_buf, sizeof(struct smb3_notify_info))) { + rc = -EFAULT; + goto notify_exit; + } + } else { + if (copy_from_user(¬ify, ioc_buf, sizeof(struct smb3_notify))) { + rc = -EFAULT; + goto notify_exit; + } + notify.data_len = 0; } tcon = cifs_sb_master_tcon(cifs_sb); @@ -2058,12 +2076,22 @@ smb3_notify(const unsigned int xid, struct file *pfile, goto notify_exit; rc = SMB2_change_notify(xid, tcon, fid.persistent_fid, fid.volatile_fid, - notify.watch_tree, notify.completion_filter); + notify.watch_tree, 
notify.completion_filter, + notify.data_len, &returned_ioctl_info, &ret_len); SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc); - + if (return_changes && (ret_len > 0) && (notify.data_len > 0)) { + if (ret_len > notify.data_len) + ret_len = notify.data_len; + pnotify_buf = (struct smb3_notify_info __user *)ioc_buf; + if (copy_to_user(pnotify_buf->notify_data, returned_ioctl_info, ret_len)) + rc = -EFAULT; + else if (copy_to_user(&pnotify_buf->data_len, &ret_len, sizeof(ret_len))) + rc = -EFAULT; + } + kfree(returned_ioctl_info); notify_exit: free_dentry_path(page); kfree(utf16_path); @@ -2274,14 +2302,18 @@ static void smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; if (shdr->Status != STATUS_NETWORK_NAME_DELETED) return; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? 
server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { spin_lock(&tcon->tc_lock); @@ -2289,7 +2321,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", - tcon->treeName); + tcon->tree_name); return; } } @@ -2498,7 +2530,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); } goto qic_exit; } @@ -2814,9 +2846,6 @@ parse_reparse_point(struct reparse_data_buffer *buf, } } -#define SMB2_SYMLINK_STRUCT_SIZE \ - (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) - static int smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -2828,13 +2857,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct kvec err_iov = {NULL, 0}; - struct smb2_err_rsp *err_buf = NULL; - struct smb2_symlink_err_rsp *symlink; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - unsigned int sub_len; - unsigned int sub_offset; - unsigned int print_len; - unsigned int print_offset; int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -2951,47 +2974,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto querty_exit; } - err_buf = err_iov.iov_base; - if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) { - rc = -EINVAL; - goto querty_exit; - } - - symlink = (struct 
smb2_symlink_err_rsp *)err_buf->ErrorData; - if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG || - le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) { - rc = -EINVAL; - goto querty_exit; - } - - /* open must fail on symlink - reset rc */ - rc = 0; - sub_len = le16_to_cpu(symlink->SubstituteNameLength); - sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); - print_len = le16_to_cpu(symlink->PrintNameLength); - print_offset = le16_to_cpu(symlink->PrintNameOffset); - - if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { - rc = -EINVAL; - goto querty_exit; - } - - if (err_iov.iov_len < - SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { - rc = -EINVAL; - goto querty_exit; - } - - *target_path = cifs_strndup_from_utf16( - (char *)symlink->PathBuffer + sub_offset, - sub_len, true, cifs_sb->local_nls); - if (!(*target_path)) { - rc = -ENOMEM; - goto querty_exit; - } - convert_delimiter(*target_path, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + rc = smb2_parse_symlink_response(cifs_sb, &err_iov, target_path); querty_exit: cifs_dbg(FYI, "query symlink rc %d\n", rc); @@ -4285,21 +4268,23 @@ init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign) static int smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) { + struct TCP_Server_Info *pserver; struct cifs_ses *ses; u8 *ses_enc_key; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->Suid == ses_id) { - spin_lock(&ses->ses_lock); - ses_enc_key = enc ? 
ses->smb3encryptionkey : - ses->smb3decryptionkey; - memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); - spin_unlock(&ses->ses_lock); - spin_unlock(&cifs_tcp_ses_lock); - return 0; - } + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) { + spin_lock(&ses->ses_lock); + ses_enc_key = enc ? ses->smb3encryptionkey : + ses->smb3decryptionkey; + memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); + spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); + return 0; } } spin_unlock(&cifs_tcp_ses_lock); @@ -4344,8 +4329,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - tfm = enc ? server->secmech.ccmaesencrypt : - server->secmech.ccmaesdecrypt; + tfm = enc ? server->secmech.enc : server->secmech.dec; if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) @@ -4410,11 +4394,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree(iv); + kfree_sensitive(iv); free_sg: - kfree(sg); + kfree_sensitive(sg); free_req: - kfree(req); + kfree_sensitive(req); return rc; } @@ -5102,7 +5086,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; - FILE_ALL_INFO *buf = NULL; + struct cifs_open_info_data buf = {}; struct cifs_io_parms io_parms = {0}; __u32 oplock = 0; struct cifs_fid fid; @@ -5118,7 +5102,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, * and was used by default in earlier versions of Windows */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto out; + return rc; /* * TODO: Add ability to create instead via reparse point. Windows (e.g. 
@@ -5127,16 +5111,10 @@ smb2_make_node(unsigned int xid, struct inode *inode, */ if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto out; + return rc; cifs_dbg(FYI, "sfu compat create special file\n"); - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; @@ -5151,21 +5129,21 @@ smb2_make_node(unsigned int xid, struct inode *inode, oplock = REQ_OPLOCK; else oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); if (rc) - goto out; + return rc; /* * BB Do not bother to decode buf since no local inode yet to put * timestamps in, but we can reuse it safely. */ - pdev = (struct win_dev *)buf; + pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; + iov[1].iov_base = &buf.fi; iov[1].iov_len = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); @@ -5184,8 +5162,8 @@ smb2_make_node(unsigned int xid, struct inode *inode, d_drop(dentry); /* FIXME: add code here to set EAs */ -out: - kfree(buf); + + cifs_free_open_info(&buf); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6352ab32c7e7..a5695748a89b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt) /* * Context Data length must be rounded to multiple of 8 for some servers */ - pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP( - sizeof(struct smb2_signing_capabilities) - - sizeof(struct smb2_neg_context) + - (num_algs * 2 /* sizeof u16 */), 8) * 8); + pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) - + sizeof(struct smb2_neg_context) + + (num_algs * sizeof(u16)), 8)); pneg_ctxt->SigningAlgorithmCount = 
cpu_to_le16(num_algs); pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC); - ctxt_len += 2 /* sizeof le16 */ * num_algs; - ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8; + ctxt_len += sizeof(__le16) * num_algs; + ctxt_len = ALIGN(ctxt_len, 8); return ctxt_len; /* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */ } @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname) /* copy up to max of first 100 bytes of server name to NetName field */ pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp)); /* context size is DataLength + minimal smb2_neg_context */ - return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) + - sizeof(struct smb2_neg_context), 8) * 8; + return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8); } static void @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, * round up total_len of fixed part of SMB3 negotiate request to 8 * byte boundary before adding negotiate contexts */ - *total_len = roundup(*total_len, 8); + *total_len = ALIGN(*total_len, 8); pneg_ctxt = (*total_len) + (char *)req; req->NegotiateContextOffset = cpu_to_le32(*total_len); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, if (server->compress_algorithm) { build_compression_ctxt((struct smb2_compression_capabilities_context *) pneg_ctxt); - ctxt_len = DIV_ROUND_UP( - sizeof(struct 
smb2_compression_capabilities_context), - 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; neg_context_count++; @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, if (rc) break; /* offsets must be 8 byte aligned */ - clen = (clen + 7) & ~0x7; + clen = ALIGN(clen, 8); offset += clen + sizeof(struct smb2_neg_context); len_of_ctxts -= clen; } @@ -873,7 +869,7 @@ SMB2_negotiate(const unsigned int xid, struct smb2_negotiate_rsp *rsp; struct kvec iov[1]; struct kvec rsp_iov; - int rc = 0; + int rc; int resp_buftype; int blob_offset, blob_length; char *security_blob; @@ -1169,9 +1165,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Dialects[0] = cpu_to_le16(server->vals->protocol_id); pneg_inbuf->DialectCount = cpu_to_le16(1); - /* structure is big enough for 3 dialects, sending only 1 */ + /* structure is big enough for 4 dialects, sending only 1 */ inbuflen = sizeof(*pneg_inbuf) - - sizeof(pneg_inbuf->Dialects[0]) * 2; + sizeof(pneg_inbuf->Dialects[0]) * 3; } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, @@ -1345,7 +1341,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) static void SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) { - free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + struct kvec *iov = sess_data->iov; + + /* iov[1] is already freed by caller */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base) + memzero_explicit(iov[0].iov_base, iov[0].iov_len); + + free_rsp_buf(sess_data->buf0_type, iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; } @@ -1477,6 +1479,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) out_put_spnego_key: key_invalidate(spnego_key); key_put(spnego_key); + if (rc) + kfree_sensitive(ses->auth_key.response); out: sess_data->result = rc; sess_data->func = NULL; @@ -1526,7 +1530,7 @@ 
SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) &blob_length, ses, server, sess_data->nls_cp); if (rc) - goto out_err; + goto out; if (use_spnego) { /* BB eventually need to add this */ @@ -1573,7 +1577,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) } out: - kfree(ntlmssp_blob); + kfree_sensitive(ntlmssp_blob); SMB2_sess_free_buffer(sess_data); if (!rc) { sess_data->result = 0; @@ -1581,7 +1585,7 @@ out: return; } out_err: - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->result = rc; sess_data->func = NULL; @@ -1657,9 +1661,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) } #endif out: - kfree(ntlmssp_blob); + kfree_sensitive(ntlmssp_blob); SMB2_sess_free_buffer(sess_data); - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->result = rc; sess_data->func = NULL; @@ -1737,7 +1741,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, cifs_server_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: - kfree(sess_data); + kfree_sensitive(sess_data); return rc; } @@ -1930,7 +1934,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); - strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) @@ -1973,6 +1977,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (!ses || !(ses->server)) return -EIO; + trace_smb3_tdis_enter(xid, tcon->tid, ses->Suid, tcon->tree_name); spin_lock(&ses->chan_lock); if ((tcon->need_reconnect) || (CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) { @@ -2004,8 +2009,11 @@ SMB2_tdis(const 
unsigned int xid, struct cifs_tcon *tcon) rc = cifs_send_recv(xid, ses, ses->server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc) + if (rc) { cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); + trace_smb3_tdis_err(xid, tcon->tid, ses->Suid, rc); + } + trace_smb3_tdis_done(xid, tcon->tid, ses->Suid); return rc; } @@ -2411,9 +2419,9 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) unsigned int acelen, acl_size, ace_count; unsigned int owner_offset = 0; unsigned int group_offset = 0; - struct smb3_acl acl; + struct smb3_acl acl = {}; - *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); + *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); if (set_owner) { /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */ @@ -2484,10 +2492,11 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ acl.AclSize = cpu_to_le16(acl_size); acl.AceCount = cpu_to_le16(ace_count); + /* acl.Sbz1 and Sbz2 MBZ so are not set here, but initialized above */ memcpy(aclptr, &acl, sizeof(struct smb3_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); - *len = roundup(ptr - (__u8 *)buf, 8); + *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8); return buf; } @@ -2581,7 +2590,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, * final path needs to be 8-byte aligned as specified in * MS-SMB2 2.2.13 SMB2 CREATE Request. 
*/ - *out_size = roundup(*out_len * sizeof(__le16), 8); + *out_size = round_up(*out_len * sizeof(__le16), 8); *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL); if (!*out_path) return -ENOMEM; @@ -2674,7 +2683,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, - tcon->treeName, utf16_path); + tcon->tree_name, utf16_path); if (rc) goto err_free_req; @@ -2816,7 +2825,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, - tcon->treeName, path); + tcon->tree_name, path); if (rc) return rc; req->NameLength = cpu_to_le16(name_len * 2); @@ -2826,9 +2835,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; /* MUST set path len (NameLength) to 0 opening root of share */ req->NameLength = cpu_to_le16(uni_path_len - 2); - copy_size = uni_path_len; - if (copy_size % 8 != 0) - copy_size = roundup(copy_size, 8); + copy_size = round_up(uni_path_len, 8); copy_path = kzalloc(copy_size, GFP_KERNEL); if (!copy_path) return -ENOMEM; @@ -3011,7 +3018,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, oparms->create_options, oparms->desired_access, rc); if (rc == -EREMCHG) { pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); tcon->need_reconnect = true; } goto creat_exit; @@ -3472,7 +3479,7 @@ smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, if (rc) return rc; - memcpy(data, begin_of_buf, buffer_length); + memcpy(data, begin_of_buf, minbufsize); return 0; } @@ -3596,7 +3603,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), 
le32_to_cpu(rsp->OutputBufferLength), - &rsp_iov, min_len, *data); + &rsp_iov, dlen ? *dlen : min_len, *data); if (rc && allocated) { kfree(*data); *data = NULL; @@ -3702,11 +3709,13 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, - u32 completion_filter) + u32 completion_filter, u32 max_out_data_len, char **out_data, + u32 *plen /* returned data len */) { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); struct smb_rqst rqst; + struct smb2_change_notify_rsp *smb_rsp; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; @@ -3722,6 +3731,9 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, memset(&rqst, 0, sizeof(struct smb_rqst)); memset(&iov, 0, sizeof(iov)); + if (plen) + *plen = 0; + rqst.rq_iov = iov; rqst.rq_nvec = 1; @@ -3740,9 +3752,28 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE); trace_smb3_notify_err(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter, rc); - } else + } else { trace_smb3_notify_done(xid, persistent_fid, tcon->tid, - ses->Suid, (u8)watch_tree, completion_filter); + ses->Suid, (u8)watch_tree, completion_filter); + /* validate that notify information is plausible */ + if ((rsp_iov.iov_base == NULL) || + (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp))) + goto cnotify_exit; + + smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; + + smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov, + sizeof(struct file_notify_information)); + + *out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL); + if (*out_data == NULL) { + rc = -ENOMEM; + goto cnotify_exit; + } 
else + *plen = le32_to_cpu(smb_rsp->OutputBufferLength); + } cnotify_exit: if (rqst.rq_iov) @@ -4090,7 +4121,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (request_type & CHAINED_REQUEST) { if (!(request_type & END_OF_CHAIN)) { /* next 8-byte aligned request */ - *total_len = DIV_ROUND_UP(*total_len, 8) * 8; + *total_len = ALIGN(*total_len, 8); shdr->NextCommand = cpu_to_le32(*total_len); } else /* END_OF_CHAIN */ shdr->NextCommand = 0; @@ -4429,7 +4460,7 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->bytes, wdata->result); if (wdata->result == -ENOSPC) pr_warn_once("Out of space writing to %s\n", - tcon->treeName); + tcon->tree_name); } else trace_smb3_write_done(0 /* no xid */, wdata->cfile->fid.persistent_fid, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f57881b8464f..1237bb86e93a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -56,6 +56,9 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL +#define SMB2_SYMLINK_STRUCT_SIZE \ + (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + #define SYMLINK_ERROR_TAG 0x4c4d5953 struct smb2_symlink_err_rsp { diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 3f740f24b96a..be21b5d26f67 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -53,16 +53,12 @@ extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); - -extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, - struct smb2_file_all_info *src); extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); -extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, FILE_ALL_INFO *data, - bool *adjust_tz, bool *symlink); +int smb2_query_path_info(const unsigned int xid, struct 
cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc); @@ -95,9 +91,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); -extern int smb2_open_file(const unsigned int xid, - struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf); +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path); +int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf); extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); @@ -148,7 +144,8 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, - u32 completion_filter); + u32 completion_filter, u32 max_out_data_len, + char **out_data, u32 *plen /* returned data len */); extern int __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, @@ -278,9 +275,9 @@ extern int smb2_query_info_compound(const unsigned int xid, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); /* query path info from the server using SMB311 POSIX extensions*/ -extern int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *sb, const char *path, struct smb311_posix_qinfo *qinf, - bool *adjust_tx, bool *symlink); +int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct 
cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 1a5fc3314dbf..381babc1212c 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -32,19 +32,17 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) struct cifs_secmech *p = &server->secmech; int rc; - rc = cifs_alloc_hash("hmac(sha256)", - &p->hmacsha256, - &p->sdeschmacsha256); + rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); if (rc) goto err; - rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); if (rc) goto err; return 0; err: - cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + cifs_free_hash(&p->hmacsha256); return rc; } @@ -54,25 +52,23 @@ smb311_crypto_shash_allocate(struct TCP_Server_Info *server) struct cifs_secmech *p = &server->secmech; int rc = 0; - rc = cifs_alloc_hash("hmac(sha256)", - &p->hmacsha256, - &p->sdeschmacsha256); + rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); if (rc) return rc; - rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); if (rc) goto err; - rc = cifs_alloc_hash("sha512", &p->sha512, &p->sdescsha512); + rc = cifs_alloc_hash("sha512", &p->sha512); if (rc) goto err; return 0; err: - cifs_free_hash(&p->cmacaes, &p->sdesccmacaes); - cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + cifs_free_hash(&p->aes_cmac); + cifs_free_hash(&p->hmacsha256); return rc; } @@ -81,18 +77,19 @@ static int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) { struct cifs_chan *chan; + struct TCP_Server_Info *pserver; struct cifs_ses *ses = NULL; - struct TCP_Server_Info *it = NULL; int i; int rc = 0; 
spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(it, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &it->smb_ses_list, smb_ses_list) { - if (ses->Suid == ses_id) - goto found; - } + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) + goto found; } cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n", __func__, ses_id); @@ -140,9 +137,13 @@ out: static struct cifs_ses * smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) { + struct TCP_Server_Info *pserver; struct cifs_ses *ses; - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { if (ses->Suid != ses_id) continue; ++ses->ses_count; @@ -219,34 +220,30 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; - struct shash_desc *shash; - struct crypto_shash *hash; - struct sdesc *sdesc = NULL; + struct shash_desc *shash = NULL; struct smb_rqst drqst; ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); - if (!ses) { + if (unlikely(!ses)) { cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); - return 0; + return -ENOENT; } memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); if (allocate_crypto) { - rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc); + rc = cifs_alloc_hash("hmac(sha256)", &shash); if (rc) { cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__); goto out; } - shash = &sdesc->shash; } else { - hash = server->secmech.hmacsha256; - shash = 
&server->secmech.sdeschmacsha256->shash; + shash = server->secmech.hmacsha256; } - rc = crypto_shash_setkey(hash, ses->auth_key.response, + rc = crypto_shash_setkey(shash->tfm, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_server_dbg(VFS, @@ -288,7 +285,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) - cifs_free_hash(&hash, &sdesc); + cifs_free_hash(&shash); if (ses) cifs_put_smb_ses(ses); return rc; @@ -315,42 +312,38 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, goto smb3signkey_ret; } - rc = crypto_shash_setkey(server->secmech.hmacsha256, + rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(server->secmech.hmacsha256); if (rc) { cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - i, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, i, 4); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - label.iov_base, label.iov_len); + rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - &zero, 1); + rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - context.iov_base, context.iov_len); + rc = 
crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -358,19 +351,16 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L256, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4); } else { - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L128, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4); } if (rc) { cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, - hashptr); + rc = crypto_shash_final(server->secmech.hmacsha256, hashptr); if (rc) { cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); goto smb3signkey_ret; @@ -550,38 +540,35 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; - struct shash_desc *shash; - struct crypto_shash *hash; - struct sdesc *sdesc = NULL; + struct shash_desc *shash = NULL; struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); - if (rc) - return 0; + if (unlikely(rc)) { + cifs_server_dbg(VFS, "%s: Could not get signing key\n", __func__); + return rc; + } if (allocate_crypto) { - rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc); + rc = cifs_alloc_hash("cmac(aes)", &shash); if (rc) return rc; - - shash = &sdesc->shash; } else { - hash = server->secmech.cmacaes; - shash = &server->secmech.sdesccmacaes->shash; + shash = server->secmech.aes_cmac; } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); 
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); - rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE); + rc = crypto_shash_setkey(shash->tfm, key, SMB2_CMACAES_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); goto out; } /* - * we already allocate sdesccmacaes when we init smb3 signing key, + * we already allocate aes_cmac when we init smb3 signing key, * so unlike smb2 case we do not have to check here if secmech are * initialized */ @@ -617,7 +604,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) - cifs_free_hash(&hash, &sdesc); + cifs_free_hash(&shash); return rc; } @@ -902,7 +889,7 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) { struct crypto_aead *tfm; - if (!server->secmech.ccmaesencrypt) { + if (!server->secmech.enc) { if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); @@ -913,23 +900,23 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) __func__); return PTR_ERR(tfm); } - server->secmech.ccmaesencrypt = tfm; + server->secmech.enc = tfm; } - if (!server->secmech.ccmaesdecrypt) { + if (!server->secmech.dec) { if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { - crypto_free_aead(server->secmech.ccmaesencrypt); - server->secmech.ccmaesencrypt = NULL; + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; cifs_server_dbg(VFS, "%s: Failed to alloc decrypt aead\n", __func__); return PTR_ERR(tfm); } - server->secmech.ccmaesdecrypt = tfm; + server->secmech.dec = tfm; } return 0; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5fbbec22bcc8..90789aaa6567 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -90,7 +90,7 @@ int 
smbd_max_send_size = 1364; int smbd_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ -int smbd_max_receive_size = 8192; +int smbd_max_receive_size = 1364; /* The timeout to initiate send of a keepalive message on idle */ int smbd_keep_alive_interval = 120; @@ -99,7 +99,7 @@ int smbd_keep_alive_interval = 120; * User configurable initial values for RDMA transport * The actual values used may be lower and are limited to hardware capabilities */ -/* Default maximum number of SGEs in a RDMA write/read */ +/* Default maximum number of pages in a single RDMA write/read */ int smbd_max_frmr_depth = 2048; /* If payload is less than this byte, use RDMA send/recv not read/write */ @@ -270,7 +270,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_request *request = container_of(wc->wr_cqe, struct smbd_request, cqe); - log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n", + log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n", request, wc->status); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { @@ -448,7 +448,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_connection *info = response->info; int data_length = 0; - log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%x\n", + log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", response, response->type, wc->status, wc->opcode, wc->byte_len, wc->pkey_index); @@ -723,7 +723,7 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n", + log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n", request->sge[0].addr, request->sge[0].length, request->sge[0].lkey); @@ -792,7 +792,7 @@ static int smbd_post_send(struct smbd_connection *info, for (i = 0; i < request->num_sge; 
i++) { log_rdma_send(INFO, - "rdma_request sge[%d] addr=%llu length=%u\n", + "rdma_request sge[%d] addr=0x%llx length=%u\n", i, request->sge[i].addr, request->sge[i].length); ib_dma_sync_single_for_device( info->id->device, @@ -1017,9 +1017,9 @@ static int smbd_post_send_data( { int i; u32 data_length = 0; - struct scatterlist sgl[SMBDIRECT_MAX_SGE]; + struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1]; - if (n_vec > SMBDIRECT_MAX_SGE) { + if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) { cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); return -EINVAL; } @@ -1079,7 +1079,7 @@ static int smbd_negotiate(struct smbd_connection *info) response->type = SMBD_NEGOTIATE_RESP; rc = smbd_post_recv(info, response); - log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x iov.lkey=%x\n", + log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); if (rc) @@ -1539,7 +1539,7 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_send_credit_target > info->id->device->attrs.max_cqe || smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_send_credit_target, info->id->device->attrs.max_cqe, info->id->device->attrs.max_qp_wr); @@ -1548,7 +1548,7 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + log_rdma_event(ERR, "consider lowering receive_credit_max = %d. 
Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_receive_credit_max, info->id->device->attrs.max_cqe, info->id->device->attrs.max_qp_wr); @@ -1562,17 +1562,15 @@ static struct smbd_connection *_smbd_get_connection( info->max_receive_size = smbd_max_receive_size; info->keep_alive_interval = smbd_keep_alive_interval; - if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) { + if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE || + info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) { log_rdma_event(ERR, - "warning: device max_send_sge = %d too small\n", - info->id->device->attrs.max_send_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); - } - if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) { - log_rdma_event(ERR, - "warning: device max_recv_sge = %d too small\n", + "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", + IB_DEVICE_NAME_MAX, + info->id->device->name, + info->id->device->attrs.max_send_sge, info->id->device->attrs.max_recv_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); + goto config_failed; } info->send_cq = NULL; @@ -1598,8 +1596,8 @@ static struct smbd_connection *_smbd_get_connection( qp_attr.qp_context = info; qp_attr.cap.max_send_wr = info->send_credit_target; qp_attr.cap.max_recv_wr = info->receive_credit_max; - qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE; - qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE; + qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE; + qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE; qp_attr.cap.max_inline_data = 0; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; @@ -1986,10 +1984,11 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; - struct kvec vec; + struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1]; int nvecs; int size; unsigned int buflen, remaining_data_length; + unsigned int offset, 
remaining_vec_data_length; int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); @@ -1998,10 +1997,8 @@ int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst; int rqst_idx; - if (info->transport_status != SMBD_CONNECTED) { - rc = -EAGAIN; - goto done; - } + if (info->transport_status != SMBD_CONNECTED) + return -EAGAIN; /* * Add in the page array if there is one. The caller needs to set @@ -2012,125 +2009,95 @@ int smbd_send(struct TCP_Server_Info *server, for (i = 0; i < num_rqst; i++) remaining_data_length += smb_rqst_len(server, &rqst_array[i]); - if (remaining_data_length > info->max_fragmented_send_size) { + if (unlikely(remaining_data_length > info->max_fragmented_send_size)) { + /* assertion: payload never exceeds negotiated maximum */ log_write(ERR, "payload size %d > max size %d\n", remaining_data_length, info->max_fragmented_send_size); - rc = -EINVAL; - goto done; + return -EINVAL; } log_write(INFO, "num_rqst=%d total length=%u\n", num_rqst, remaining_data_length); rqst_idx = 0; -next_rqst: - rqst = &rqst_array[rqst_idx]; - iov = rqst->rq_iov; - - cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", - rqst_idx, smb_rqst_len(server, rqst)); - for (i = 0; i < rqst->rq_nvec; i++) - dump_smb(iov[i].iov_base, iov[i].iov_len); - - - log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); - - start = i = 0; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen-iov[i].iov_len); - log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", - start, i, i - start, - remaining_data_length); - rc = smbd_post_send_data( - info, &iov[start], i-start, - remaining_data_length); - if (rc) - goto done; - } else { - /* iov[start] is too big, break it 
*/ - nvecs = (buflen+max_iov_size-1)/max_iov_size; - log_write(INFO, "iov[%d] iov_base=%p buflen=%d break to %d vectors\n", - start, iov[start].iov_base, - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j*max_iov_size; - vec.iov_len = max_iov_size; - if (j == nvecs-1) - vec.iov_len = - buflen - - max_iov_size*(nvecs-1); - remaining_data_length -= vec.iov_len; - log_write(INFO, - "sending vec j=%d iov_base=%p iov_len=%zu remaining_data_length=%d\n", - j, vec.iov_base, vec.iov_len, - remaining_data_length); - rc = smbd_post_send_data( - info, &vec, 1, - remaining_data_length); - if (rc) - goto done; + do { + rqst = &rqst_array[rqst_idx]; + iov = rqst->rq_iov; + + cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", + rqst_idx, smb_rqst_len(server, rqst)); + remaining_vec_data_length = 0; + for (i = 0; i < rqst->rq_nvec; i++) { + remaining_vec_data_length += iov[i].iov_len; + dump_smb(iov[i].iov_base, iov[i].iov_len); + } + + log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", + rqst_idx, rqst->rq_nvec, + rqst->rq_npages, rqst->rq_pagesz, + rqst->rq_tailsz, smb_rqst_len(server, rqst)); + + start = 0; + offset = 0; + do { + buflen = 0; + i = start; + j = 0; + while (i < rqst->rq_nvec && + j < SMBDIRECT_MAX_SEND_SGE - 1 && + buflen < max_iov_size) { + + vecs[j].iov_base = iov[i].iov_base + offset; + if (buflen + iov[i].iov_len > max_iov_size) { + vecs[j].iov_len = + max_iov_size - iov[i].iov_len; + buflen = max_iov_size; + offset = vecs[j].iov_len; + } else { + vecs[j].iov_len = + iov[i].iov_len - offset; + buflen += vecs[j].iov_len; + offset = 0; + ++i; } - i++; - if (i == rqst->rq_nvec) - break; + ++j; } + + remaining_vec_data_length -= buflen; + remaining_data_length -= buflen; + log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n", + remaining_vec_data_length > 0 ? 
+ "partial" : "complete", + rqst->rq_nvec, start, j, + remaining_data_length); + start = i; - buflen = 0; - } else { - i++; - if (i == rqst->rq_nvec) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", - start, i, i - start, + rc = smbd_post_send_data(info, vecs, j, remaining_data_length); + if (rc) + goto done; + } while (remaining_vec_data_length > 0); + + /* now sending pages if there are any */ + for (i = 0; i < rqst->rq_npages; i++) { + rqst_page_get_length(rqst, i, &buflen, &offset); + nvecs = (buflen + max_iov_size - 1) / max_iov_size; + log_write(INFO, "sending pages buflen=%d nvecs=%d\n", + buflen, nvecs); + for (j = 0; j < nvecs; j++) { + size = min_t(unsigned int, max_iov_size, remaining_data_length); + remaining_data_length -= size; + log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", + i, j * max_iov_size + offset, size, remaining_data_length); - rc = smbd_post_send_data(info, &iov[start], - i-start, remaining_data_length); + rc = smbd_post_send_page( + info, rqst->rq_pages[i], + j*max_iov_size + offset, + size, remaining_data_length); if (rc) goto done; - break; } } - log_write(INFO, "looping i=%d buflen=%d\n", i, buflen); - } - - /* now sending pages if there are any */ - for (i = 0; i < rqst->rq_npages; i++) { - unsigned int offset; - - rqst_page_get_length(rqst, i, &buflen, &offset); - nvecs = (buflen + max_iov_size - 1) / max_iov_size; - log_write(INFO, "sending pages buflen=%d nvecs=%d\n", - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - size = max_iov_size; - if (j == nvecs-1) - size = buflen - j*max_iov_size; - remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", - i, j * max_iov_size + offset, size, - remaining_data_length); - rc = smbd_post_send_page( - info, rqst->rq_pages[i], - j*max_iov_size + offset, - size, remaining_data_length); - 
if (rc) - goto done; - } - } - - rqst_idx++; - if (rqst_idx < num_rqst) - goto next_rqst; + } while (++rqst_idx < num_rqst); done: /* diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index a87fca82a796..207ef979cd51 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -91,7 +91,7 @@ struct smbd_connection { /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ int responder_resources; - /* Maximum number of SGEs in a RDMA write/read */ + /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; /* * If payload is less than or equal to the threshold, @@ -225,21 +225,25 @@ struct smbd_buffer_descriptor_v1 { __le32 length; } __packed; -/* Default maximum number of SGEs in a RDMA send/recv */ -#define SMBDIRECT_MAX_SGE 16 +/* Maximum number of SGEs used by smbdirect.c in any send work request */ +#define SMBDIRECT_MAX_SEND_SGE 6 + /* The context for a SMBD request */ struct smbd_request { struct smbd_connection *info; struct ib_cqe cqe; - /* the SGE entries for this packet */ - struct ib_sge sge[SMBDIRECT_MAX_SGE]; + /* the SGE entries for this work request */ + struct ib_sge sge[SMBDIRECT_MAX_SEND_SGE]; int num_sge; /* SMBD packet header follows this structure */ u8 packet[]; }; +/* Maximum number of SGEs used by smbdirect.c in any receive work request */ +#define SMBDIRECT_MAX_RECV_SGE 1 + /* The context for a SMBD response */ struct smbd_response { struct smbd_connection *info; diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 6b88dc2e364f..110070ba8b04 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -372,6 +372,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, @@ -409,6 +410,7 @@ 
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, @@ -451,6 +453,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); /* * For logging SMB3 Status code and Command for responses which return errors diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9a2753e21170..575fa8f58342 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -753,8 +753,9 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_freezekillable_unsafe(server->response_q, - midQ->mid_state != MID_REQUEST_SUBMITTED); + error = wait_event_state(server->response_q, + midQ->mid_state != MID_REQUEST_SUBMITTED, + (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); if (error < 0) return -ERESTARTSYS; diff --git a/fs/coredump.c b/fs/coredump.c index 3538f3a63965..7bad7785e8e6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -354,7 +354,7 @@ static int zap_process(struct task_struct *start, int exit_code) struct task_struct *t; int nr = 0; - /* ignore all signals except SIGKILL, see prepare_signal() */ + /* Allow SIGKILL, see prepare_signal() */ start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -402,9 +402,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) if (core_waiters > 0) { struct core_thread *ptr; - freezer_do_not_count(); - wait_for_completion(&core_state->startup); - freezer_count(); + wait_for_completion_state(&core_state->startup, + TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); /* * Wait for 
all the threads to become inactive, so that * all the thread context (extended register state, like @@ -412,7 +411,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state) */ ptr = core_state->dumper.next; while (ptr != NULL) { - wait_task_inactive(ptr->task, 0); + wait_task_inactive(ptr->task, TASK_ANY); ptr = ptr->next; } } @@ -1101,30 +1100,20 @@ whole: return vma->vm_end - vma->vm_start; } -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} - /* * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, +static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, + struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) + if (gate_vma && (vma == gate_vma)) return NULL; + + vma = mas_next(mas, ULONG_MAX); + if (vma) + return vma; return gate_vma; } @@ -1148,9 +1137,10 @@ static void free_vma_snapshot(struct coredump_params *cprm) */ static bool dump_vma_snapshot(struct coredump_params *cprm) { - struct vm_area_struct *vma, *gate_vma; + struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - int i; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int i = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1170,8 +1160,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma), i++) { + while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; @@ -1179,10 +1168,10 @@ static bool 
dump_vma_snapshot(struct coredump_params *cprm) m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); m->pgoff = vma->vm_pgoff; - m->file = vma->vm_file; if (m->file) get_file(m->file); + i++; } mmap_write_unlock(mm); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 1cca09aa43f8..2a24b1f0ae68 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -205,14 +205,19 @@ static int allocate_filesystem_keyring(struct super_block *sb) } /* - * This is called at unmount time to release all encryption keys that have been - * added to the filesystem, along with the keyring that contains them. + * Release all encryption keys that have been added to the filesystem, along + * with the keyring that contains them. * - * Note that besides clearing and freeing memory, this might need to evict keys - * from the keyslots of an inline crypto engine. Therefore, this must be called - * while the filesystem's underlying block device(s) are still available. + * This is called at unmount time. The filesystem's underlying block device(s) + * are still available at this time; this is important because after user file + * accesses have been allowed, this function may need to evict keys from the + * keyslots of an inline crypto engine, which requires the block device(s). + * + * This is also called when the super_block is being freed. This is needed to + * avoid a memory leak if mounting fails after the "test_dummy_encryption" + * option was processed, as in that case the unmount-time call isn't made. */ -void fscrypt_sb_delete(struct super_block *sb) +void fscrypt_destroy_keyring(struct super_block *sb) { struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; diff --git a/fs/dcache.c b/fs/dcache.c index bb0c4d0038db..52e6d5fdab6b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { - /* - * The caller holds a spinlock (dentry::d_lock). 
On !PREEMPT_RT - * kernels spin_lock() implicitly disables preemption, but not on - * PREEMPT_RT. So for RT it has to be done explicitly to protect - * the sequence count write side critical section against a reader - * or another writer preempting, which would result in a live lock. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); wake_up_all(d_wait); } @@ -3258,8 +3249,10 @@ void d_genocide(struct dentry *parent) EXPORT_SYMBOL(d_genocide); -void d_tmpfile(struct dentry *dentry, struct inode *inode) +void d_tmpfile(struct file *file, struct inode *inode) { + struct dentry *dentry = file->f_path.dentry; + inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 950c63fa4d0b..ddb3fc258df9 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1121,7 +1121,7 @@ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, } EXPORT_SYMBOL_GPL(debugfs_print_regs32); -static int debugfs_show_regset32(struct seq_file *s, void *data) +static int debugfs_regset32_show(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; @@ -1136,17 +1136,7 @@ static int debugfs_show_regset32(struct seq_file *s, void *data) return 0; } -static int debugfs_open_regset32(struct inode *inode, struct file *file) -{ - return single_open(file, debugfs_show_regset32, inode->i_private); -} - -static const struct file_operations fops_regset32 = { - .open = debugfs_open_regset32, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; 
+DEFINE_SHOW_ATTRIBUTE(debugfs_regset32); /** * debugfs_create_regset32 - create a debugfs file that returns register values @@ -1167,7 +1157,7 @@ void debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset) { - debugfs_create_file(name, mode, parent, regset, &fops_regset32); + debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 232cfdf095ae..2e8e112b1993 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -82,6 +82,8 @@ struct debugfs_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; + /* Opt_* bitfield. */ + unsigned int opts; }; enum { @@ -111,6 +113,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) kgid_t gid; char *p; + opts->opts = 0; opts->mode = DEBUGFS_DEFAULT_MODE; while ((p = strsep(&data, ",")) != NULL) { @@ -145,24 +148,44 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) * but traditionally debugfs has ignored all mount options */ } + + opts->opts |= BIT(token); } return 0; } -static int debugfs_apply_options(struct super_block *sb) +static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct debugfs_mount_opts *opts = &fsi->mount_opts; - inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= opts->mode; + /* + * On remount, only reset mode/uid/gid if they were provided as mount + * options. 
+ */ - inode->i_uid = opts->uid; - inode->i_gid = opts->gid; + if (!remount || opts->opts & BIT(Opt_mode)) { + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + } - return 0; + if (!remount || opts->opts & BIT(Opt_uid)) + inode->i_uid = opts->uid; + + if (!remount || opts->opts & BIT(Opt_gid)) + inode->i_gid = opts->gid; +} + +static void debugfs_apply_options(struct super_block *sb) +{ + _debugfs_apply_options(sb, false); +} + +static void debugfs_apply_options_remount(struct super_block *sb) +{ + _debugfs_apply_options(sb, true); } static int debugfs_remount(struct super_block *sb, int *flags, char *data) @@ -175,7 +198,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data) if (err) goto fail; - debugfs_apply_options(sb); + debugfs_apply_options_remount(sb); fail: return err; diff --git a/fs/direct-io.c b/fs/direct-io.c index f669163d5860..03d381377ae1 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -421,8 +421,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index a0ef63cfcecb..9e4f47808bd5 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -651,22 +651,6 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, if (err) return err; - /* - * Ensure that the available space hasn't shrunk below the safe level - */ - status = check_var_size(attributes, *size + ucs2_strsize(name, 1024)); - if (status != EFI_SUCCESS) { - if (status != EFI_UNSUPPORTED) { - err = efi_status_to_err(status); - goto out; - } - - if (*size > 65536) { - err = -ENOSPC; - goto out; - } - } - status = efivar_set_variable_locked(name, vendor, attributes, *size, data, false); if (status != EFI_SUCCESS) { diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c 
index 998cd26a1b3b..fe05bc51f9f2 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -590,14 +590,17 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, struct super_block *psb = erofs_pseudo_mnt->mnt_sb; mutex_lock(&erofs_domain_cookies_lock); + spin_lock(&psb->s_inode_list_lock); list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { ctx = inode->i_private; if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) continue; igrab(inode); + spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } + spin_unlock(&psb->s_inode_list_lock); ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); mutex_unlock(&erofs_domain_cookies_lock); return ctx; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index cce56dde135c..c7f24fc7efd5 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -7,6 +7,7 @@ #include "zdata.h" #include "compress.h" #include <linux/prefetch.h> +#include <linux/psi.h> #include <trace/events/erofs.h> @@ -812,15 +813,14 @@ retry: ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; - - if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && - !(map->m_flags & EROFS_MAP_PARTIAL_REF) && - fe->pcl->length == map->m_llen) - fe->pcl->partial = false; if (fe->pcl->length < offset + end - map->m_la) { fe->pcl->length = offset + end - map->m_la; fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; } + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; next_part: /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; @@ -887,15 +887,13 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { unsigned int pgnr; - struct page *oldpage; pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pgnr >= be->nr_pages); - 
oldpage = be->decompressed_pages[pgnr]; - be->decompressed_pages[pgnr] = bvec->page; - - if (!oldpage) + if (!be->decompressed_pages[pgnr]) { + be->decompressed_pages[pgnr] = bvec->page; return; + } } /* (cold path) one pcluster is requested multiple times */ @@ -1414,6 +1412,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; + /* initialize to 1 to make skip psi_memstall_leave unless needed */ + unsigned long pflags = 1; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1463,10 +1463,15 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, if (bio && (cur != last_index + 1 || last_bdev != mdev.m_bdev)) { submit_bio_retry: + if (!pflags) + psi_memstall_leave(&pflags); submit_bio(bio); bio = NULL; } + if (unlikely(PageWorkingset(page))) + psi_memstall_enter(&pflags); + if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); @@ -1494,8 +1499,11 @@ submit_bio_retry: move_to_bypass_jobqueue(pcl, qtail, owned_head); } while (owned_head != Z_EROFS_PCLUSTER_TAIL); - if (bio) + if (bio) { + if (!pflags) + psi_memstall_leave(&pflags); submit_bio(bio); + } /* * although background is preferred, no one is pending for submission. 
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index e7f04c4fbb81..d98c95212985 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -126,10 +126,10 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) } /* - * bit 31: I/O error occurred on this page - * bit 0 - 30: remaining parts to complete this page + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page */ -#define Z_EROFS_PAGE_EIO (1 << 31) +#define Z_EROFS_PAGE_EIO (1 << 30) static inline void z_erofs_onlinepage_init(struct page *page) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 44c27ef39c43..0bb66927e3d0 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -57,8 +57,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), - EROFS_KMAP_ATOMIC); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out_unlock; @@ -73,7 +72,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); vi->z_tailextent_headlcn = 0; - goto unmap_done; + goto done; } vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; @@ -85,7 +84,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; - goto unmap_done; + goto out_put_metabuf; } vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); @@ -95,7 +94,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto unmap_done; + goto out_put_metabuf; } if (vi->datalayout == 
EROFS_INODE_FLAT_COMPRESSION && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ @@ -103,12 +102,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto unmap_done; + goto out_put_metabuf; } -unmap_done: - erofs_put_metabuf(&buf); - if (err) - goto out_unlock; if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { struct erofs_map_blocks map = { @@ -127,7 +122,7 @@ unmap_done: err = -EFSCORRUPTED; } if (err < 0) - goto out_unlock; + goto out_put_metabuf; } if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && @@ -141,11 +136,14 @@ unmap_done: EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (err < 0) - goto out_unlock; + goto out_put_metabuf; } +done: /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +out_put_metabuf: + erofs_put_metabuf(&buf); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; diff --git a/fs/eventfd.c b/fs/eventfd.c index 3627dd7d25db..c0ffee99ad23 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -69,17 +69,17 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) * it returns false, the eventfd_signal() call should be deferred to a * safe context. 
*/ - if (WARN_ON_ONCE(current->in_eventfd_signal)) + if (WARN_ON_ONCE(current->in_eventfd)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); - current->in_eventfd_signal = 1; + current->in_eventfd = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); - current->in_eventfd_signal = 0; + current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; @@ -253,8 +253,10 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) __set_current_state(TASK_RUNNING); } eventfd_ctx_do_read(ctx, &ucnt); + current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); + current->in_eventfd = 0; spin_unlock_irq(&ctx->wqh.lock); if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt))) return -EFAULT; @@ -301,8 +303,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c } if (likely(res > 0)) { ctx->count += ucnt; + current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); + current->in_eventfd = 0; } spin_unlock_irq(&ctx->wqh.lock); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b56b94e2f56..52954d4637b5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1065,7 +1065,7 @@ static inline bool list_add_tail_lockless(struct list_head *new, * added to the list from another CPU: the winner observes * new->next == new. 
*/ - if (cmpxchg(&new->next, new, head) != new) + if (!try_cmpxchg(&new->next, &new, head)) return false; /* diff --git a/fs/exec.c b/fs/exec.c index 768843477a49..a0b1f0337a62 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -28,7 +28,6 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> -#include <linux/vmacache.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> @@ -683,6 +682,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + struct vm_area_struct *next; struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -691,7 +692,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * ensure there are no vmas between where we want to go * and where we are */ - if (vma != find_vma(mm, new_start)) + if (vma != vma_next(&vmi)) return -EFAULT; /* @@ -710,12 +711,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) lru_add_drain(); tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -724,7 +726,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? 
next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); @@ -1022,9 +1024,9 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; - vmacache_flush(tsk); + lru_gen_add_mm(mm); task_unlock(tsk); + lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); @@ -1195,11 +1197,11 @@ static int unshare_sighand(struct task_struct *me) return -ENOMEM; refcount_set(&newsighand->count, 1); - memcpy(newsighand->action, oldsighand->action, - sizeof(newsighand->action)); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); + memcpy(newsighand->action, oldsighand->action, + sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); @@ -1879,7 +1881,7 @@ static int do_execveat_common(int fd, struct filename *filename, * whether NPROC limit is still exceeded. 
*/ if ((current->flags & PF_NPROC_EXCEEDED) && - is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { + is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a795437b86d0..5590a1e83126 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -552,7 +552,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if (info->attr & ATTR_SUBDIR) { /* directory */ inode->i_generation &= ~1; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index c17ccc19b938..5dc0a31f4a08 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -126,6 +126,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) struct ext2_group_desc * desc; struct buffer_head * bh = NULL; ext2_fsblk_t bitmap_blk; + int ret; desc = ext2_get_group_desc(sb, block_group, NULL); if (!desc) @@ -139,10 +140,10 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) block_group, le32_to_cpu(desc->bg_block_bitmap)); return NULL; } - if (likely(bh_uptodate_or_lock(bh))) + ret = bh_read(bh, 0); + if (ret > 0) return bh; - - if (bh_submit_read(bh) < 0) { + if (ret < 0) { brelse(bh); ext2_error(sb, __func__, "Cannot read block bitmap - " diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 998dd2ac8008..f4944c4dee60 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -277,8 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - group = prandom_u32(); - parent_group = (unsigned)group % ngroups; + parent_group = prandom_u32_max(ngroups); for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext2_get_group_desc (sb, group, 
NULL); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5fd9a22d2b70..9125eab85146 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -120,7 +120,7 @@ static int ext2_create (struct user_namespace * mnt_userns, } static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); if (IS_ERR(inode)) @@ -128,9 +128,9 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, ext2_set_file_ops(inode); mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); - return 0; + return finish_open_simple(file, 0); } static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 252c742379cf..03f2af98b1b4 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -163,7 +163,7 @@ static void ext2_put_super (struct super_block * sb) db_count = sbi->s_gdb_count; for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); kfree(sbi->s_debts); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -1052,6 +1052,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group); goto failed_mount; } + /* At least inode table, bitmaps, and sb have to fit in one group */ + if (sbi->s_blocks_per_group <= sbi->s_itb_per_group + 3) { + ext2_msg(sb, KERN_ERR, + "error: #blocks per group smaller than metadata size: %lu <= %lu", + sbi->s_blocks_per_group, sbi->s_inodes_per_group + 3); + goto failed_mount; + } if (sbi->s_frags_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, "error: #fragments per group too big: %lu", @@ -1065,9 +1072,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_inodes_per_group); goto failed_mount; } 
+ if (sb_bdev_nr_blocks(sb) < le32_to_cpu(es->s_blocks_count)) { + ext2_msg(sb, KERN_ERR, + "bad geometry: block count %u exceeds size of device (%u blocks)", + le32_to_cpu(es->s_blocks_count), + (unsigned)sb_bdev_nr_blocks(sb)); + goto failed_mount; + } - if (EXT2_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext2; sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; @@ -1080,7 +1092,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc_array(db_count, + sbi->s_group_desc = kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { @@ -1206,7 +1218,7 @@ failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); failed_mount_group_desc: - kfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); kfree(sbi->s_debts); failed_mount: brelse(bh); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 208b87ce8858..e9bc46684106 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -463,10 +463,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); - grp = hinfo.hash; + parent_group = hinfo.hash % ngroups; } else - grp = prandom_u32(); - parent_group = (unsigned)grp % ngroups; + parent_group = prandom_u32_max(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); @@ -1280,7 +1279,7 @@ got: EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 790d5ffe8559..95dfea28bf4e 100644 --- 
a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -453,8 +453,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ctime = inode_bl->i_ctime = current_time(inode); inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); - inode_bl->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); + inode_bl->i_generation = get_random_u32(); ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9af68a7ecdcf..588cb09c5291 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -265,7 +265,7 @@ static unsigned int mmp_new_seq(void) u32 new_seq; do { - new_seq = prandom_u32(); + new_seq = get_random_u32(); } while (new_seq > EXT4_MMP_SEQ_MAX); return new_seq; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index be8136aafa22..c08c0aba1883 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2862,7 +2862,7 @@ retry: } static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2884,7 +2884,7 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_unlock_inode; @@ -2895,7 +2895,7 @@ retry: ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return finish_open_simple(file, err); err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6ddebc9f1b90..7cdd2138c897 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1741,10 +1741,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static const char deprecated_msg[] = - "Mount option \"%s\" will be removed by %s\n" - "Contact 
linux-ext4@vger.kernel.org if you think we should keep it.\n"; - #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 @@ -3782,8 +3778,7 @@ cont_thread: } if (!progress) { elr->lr_next_sched = jiffies + - (prandom_u32() - % (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; @@ -3930,8 +3925,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. */ - elr->lr_next_sched = jiffies + (prandom_u32() % - (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + elr->lr_next_sched = jiffies + prandom_u32_max( + EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 20cadfb740dc..3c640bd7ecae 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -363,13 +363,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index eaa240b21f07..5bbc44a5216e 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -219,7 +219,7 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8259e0fa97e1..0c82dae082aa 100644 --- a/fs/f2fs/checkpoint.c 
+++ b/fs/f2fs/checkpoint.c @@ -26,12 +26,16 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) + if (!end_io) { f2fs_flush_merged_writes(sbi); + + f2fs_handle_stop(sbi, reason); + } } /* @@ -89,7 +93,7 @@ repeat: return ERR_PTR(err); } - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -122,7 +126,7 @@ retry: if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } @@ -140,7 +144,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int segno, offset; bool exist; - if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ) + if (type == DATA_GENERIC) return true; segno = GET_SEGNO(sbi, blkaddr); @@ -148,6 +152,13 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return exist; + } + if (!exist && type == DATA_GENERIC_ENHANCE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); @@ -185,6 +196,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case DATA_GENERIC: case DATA_GENERIC_ENHANCE: case DATA_GENERIC_ENHANCE_READ: + case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) { f2fs_warn(sbi, "access invalid blkaddr:%u", @@ 
-276,7 +288,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, f2fs_put_page(page, err ? 1 : 0); if (!err) - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, + F2FS_BLKSIZE); } out: blk_finish_plug(&plug); @@ -448,8 +461,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); set_page_private_reference(&folio->page); return true; @@ -1053,7 +1065,8 @@ void f2fs_remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp) { struct list_head *head; struct inode *inode; @@ -1088,11 +1101,15 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; - F2FS_I(inode)->cp_task = current; + if (from_cp) + F2FS_I(inode)->cp_task = current; + F2FS_I(inode)->wb_task = current; filemap_fdatawrite(inode->i_mapping); - F2FS_I(inode)->cp_task = NULL; + F2FS_I(inode)->wb_task = NULL; + if (from_cp) + F2FS_I(inode)->cp_task = NULL; iput(inode); /* We need to give cpu to another writers. 
*/ @@ -1221,7 +1238,7 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); if (err) return err; cond_resched(); @@ -1892,15 +1909,27 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; + struct task_struct *ckpt_task; + + if (!cprc->f2fs_issue_ckpt) + return; - if (cprc->f2fs_issue_ckpt) { - struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + ckpt_task = cprc->f2fs_issue_ckpt; + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); - cprc->f2fs_issue_ckpt = NULL; - kthread_stop(ckpt_task); + f2fs_flush_ckpt_thread(sbi); +} - flush_remained_ckpt_reqs(sbi, NULL); - } +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + flush_remained_ckpt_reqs(sbi, NULL); + + /* Let's wait for the previous dispatched checkpoint. 
*/ + while (atomic_read(&cprc->queued_ckpt)) + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 70e97075e535..d315c2de136f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -762,6 +762,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } @@ -912,17 +913,15 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) reason = "[C|*|C|*]"; goto out; } - if (compressed) { - if (!__is_valid_data_blkaddr(blkaddr)) { - if (!cluster_end) - cluster_end = i; - continue; - } - /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ - if (cluster_end) { - reason = "[C|N|N|V]"; - goto out; - } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; } } return false; @@ -952,6 +951,7 @@ static int __f2fs_cluster_blocks(struct inode *inode, if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); goto fail; } @@ -1568,12 +1568,8 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, if (!dic->cbuf) return -ENOMEM; - if (cops->init_decompress_ctx) { - int ret = cops->init_decompress_ctx(dic); - - if (ret) - return ret; - } + if (cops->init_decompress_ctx) + return cops->init_decompress_ctx(dic); return 0; } @@ -1905,7 +1901,7 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->compress_inode->i_mapping; + struct address_space *mapping = COMPRESS_MAPPING(sbi); struct folio_batch fbatch; pgoff_t index = 0; 
pgoff_t end = MAX_BLKADDR(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 93cc2ec51c2a..a71e818cd67b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -335,7 +335,8 @@ static void f2fs_write_end_io(struct bio *bio) mempool_free(page, sbi->write_io_dummy); if (unlikely(bio->bi_status)) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); continue; } @@ -351,7 +352,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && @@ -705,8 +707,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? - META_GENERIC : DATA_GENERIC_ENHANCE))) + META_GENERIC : DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -725,7 +729,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page): WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page)); __submit_bio(fio->sbi, bio, fio->type); return 0; @@ -906,8 +910,10 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, - __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -1085,7 +1091,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, } ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); return 0; } @@ -1217,6 +1223,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; @@ -1237,6 +1245,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } got_it: @@ -1550,6 +1560,7 @@ next_block: if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } @@ -1595,6 +1606,8 @@ next_block: (flag != F2FS_GET_BLOCK_FIEMAP || IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); goto sync_out; } if (flag == F2FS_GET_BLOCK_BMAP) { @@ -1818,7 +1831,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); - if (err || err == 1) + if (err) return err; } @@ -2076,6 +2089,8 @@ got_it: if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto out; } } else { @@ -2124,7 +2139,8 @@ submit_and_realloc: goto submit_and_realloc; 
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); - f2fs_update_iostat(F2FS_I_SB(inode), FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = block_nr; goto out; @@ -2272,8 +2288,7 @@ submit_and_realloc: refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = blkaddr; } @@ -2545,7 +2560,7 @@ bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); @@ -2619,8 +2634,11 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, - DATA_GENERIC_ENHANCE)) + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(fio->sbi, + ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } ipu_force = true; fio->need_lock = LOCK_DONE; @@ -2648,6 +2666,7 @@ got_it: !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } @@ -2858,7 +2877,7 @@ out: } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task && allow_balance) + !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -3158,7 +3177,7 @@ static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { /* to avoid deadlock in path of data flush */ - if (F2FS_I(inode)->cp_task) + if (F2FS_I(inode)->wb_task) 
return false; if (!S_ISREG(inode->i_mode)) @@ -3561,6 +3580,7 @@ repeat: if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); @@ -3699,8 +3719,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { f2fs_update_dirty_folio(inode, folio); return true; } @@ -3972,6 +3991,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret < 0) return ret; + stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; @@ -3981,6 +4001,7 @@ static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); + stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c01471573977..a216dcdf6941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,7 +91,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->aw_cnt = sbi->atomic_files; + si->aw_cnt = atomic_read(&sbi->atomic_files); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); @@ -135,6 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = 
sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; @@ -347,7 +348,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_readonly(si->sbi->sb) ? "RO" : "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); if (si->sbi->s_flag) { @@ -385,6 +386,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -607,6 +610,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d5bd7932fb64..21960a899b6a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1041,6 +1041,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 866e72b29bd5..932c070173b9 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -544,7 +544,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); 
write_lock(&et->lock); @@ -583,7 +583,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, org_end = dei.fofs + dei.len; f2fs_bug_on(sbi, pos >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { en->ei.len = pos - en->ei.fofs; prev_en = en; parts = 1; @@ -675,7 +675,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -804,9 +804,8 @@ void f2fs_drop_extent_tree(struct inode *inode) if (!f2fs_may_extent_tree(inode)) return; - set_inode_flag(inode, FI_NO_EXTENT); - write_lock(&et->lock); + set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); if (et->largest.len) { et->largest.len = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index aea816a133a8..e6355a5683b7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -266,6 +266,10 @@ enum { * condition of read on truncated area * by extent_cache */ + DATA_GENERIC_ENHANCE_UPDATE, /* + * strong check on range and segment + * bitmap for update case + */ META_GENERIC, }; @@ -274,7 +278,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ - TRANS_DIR_INO, /* for trasactions dir ino list */ + TRANS_DIR_INO, /* for transactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. 
list */ }; @@ -782,6 +786,7 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ struct task_struct *cp_task; /* separate cp/wb IO stats*/ + struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ spinlock_t i_size_lock; /* protect last_disk_size */ @@ -1158,7 +1163,10 @@ enum iostat_type { APP_BUFFERED_IO, /* app buffered write IOs */ APP_WRITE_IO, /* app write IOs */ APP_MAPPED_IO, /* app mapped IOs */ + APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ + APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ FS_META_IO, /* meta IOs from kworker/reclaimer */ FS_GC_DATA_IO, /* data IOs from forground gc */ @@ -1172,6 +1180,8 @@ enum iostat_type { APP_BUFFERED_READ_IO, /* app buffered read IOs */ APP_READ_IO, /* app read IOs */ APP_MAPPED_READ_IO, /* app mapped read IOs */ + APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ + APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ FS_DATA_READ_IO, /* data read IOs */ FS_GDATA_READ_IO, /* data read IOs from background gc */ FS_CDATA_READ_IO, /* compressed data read IOs */ @@ -1247,7 +1257,6 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ - ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1726,11 +1735,9 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ spinlock_t gc_urgent_high_lock; - bool gc_urgent_high_limited; /* indicates 
having limited trial count */ unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ - unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -1761,6 +1768,8 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ + atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ @@ -1806,6 +1815,10 @@ struct f2fs_sb_info { struct workqueue_struct *post_read_wq; /* post read workqueue */ + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + spinlock_t error_lock; /* protect errors array */ + bool error_dirty; /* errors of sb is dirty */ + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ @@ -2525,7 +2538,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) - return &ckpt->sit_nat_version_bitmap; + return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { @@ -3547,6 +3560,8 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int 
f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3706,7 +3721,9 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); @@ -3736,7 +3753,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); void f2fs_remove_dirty_inode(struct inode *inode); -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3858,7 +3876,7 @@ struct f2fs_stat_info { int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode; + int compr_inode, swapfile_inode; unsigned long long compr_blocks; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; @@ -3947,6 +3965,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define 
stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3966,7 +3992,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = F2FS_I_SB(inode)->atomic_files; \ + int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ @@ -4031,6 +4057,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) +#define stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 791770507328..82cda1258227 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -43,8 +43,8 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) ret = filemap_fault(vmf); if (!ret) - f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, - F2FS_BLKSIZE); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_MAPPED_READ_IO, F2FS_BLKSIZE); trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); @@ -154,7 +154,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) 
SetPageUptodate(page); - f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); trace_f2fs_vm_page_mkwrite(page, DATA); @@ -822,7 +822,12 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) /* disallow direct IO if any of devices has unaligned blksize */ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) return true; - + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + return true; if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) return true; if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) @@ -912,9 +917,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(vfsgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -1196,6 +1202,7 @@ next_dnode: !f2fs_is_valid_blkaddr(sbi, *blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -1480,6 +1487,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); break; } @@ -2089,9 +2097,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) } f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files++; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + stat_inc_atomic_inode(inode); 
set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); @@ -2185,7 +2191,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) { if (ret == -EROFS) { ret = 0; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); trace_f2fs_shutdown(sbi, in, ret); } @@ -2198,7 +2205,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = freeze_bdev(sb->s_bdev); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev); break; @@ -2207,16 +2214,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = f2fs_sync_fs(sb, 1); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: @@ -3362,8 +3369,10 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3524,8 +3533,10 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + 
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3797,6 +3808,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -4253,7 +4266,7 @@ static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_READ); if (error) return error; - f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size); return 0; } @@ -4342,7 +4355,8 @@ skip_read_trace: } else { ret = filemap_read(iocb, to, 0); if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); } if (trace_f2fs_dataread_end_enabled()) trace_f2fs_dataread_end(inode, pos, ret); @@ -4459,7 +4473,8 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, if (ret > 0) { iocb->ki_pos += ret; - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_IO, ret); } return ret; } @@ -4472,7 +4487,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_WRITE); if (error) return error; - f2fs_update_iostat(sbi, APP_DIRECT_IO, size); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); return 0; } @@ -4660,7 +4675,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) skip_write_trace: /* Do the actual write. */ ret = dio ? 
- f2fs_dio_write_iter(iocb, from, &may_need_sync): + f2fs_dio_write_iter(iocb, from, &may_need_sync) : f2fs_buffered_write_iter(iocb, from); if (trace_f2fs_datawrite_end_enabled()) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6da21d405ce1..4546e01b2ee0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -74,7 +74,8 @@ static int gc_thread_func(void *data) if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); } if (!sb_start_write_trylock(sbi->sb)) { @@ -97,14 +98,10 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT_HIGH) { spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_limited) { - if (!sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_limited = false; - spin_unlock(&sbi->gc_urgent_high_lock); - sbi->gc_mode = GC_NORMAL; - continue; - } + if (sbi->gc_urgent_high_remaining) { sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; } spin_unlock(&sbi->gc_urgent_high_lock); } @@ -285,7 +282,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* let's select beginning hot/small space first in no_heap mode*/ if (f2fs_need_rand_seg(sbi)) - p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; @@ -1082,7 +1079,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node; + unsigned int ofs_in_node, max_addrs; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1108,6 +1105,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } + max_addrs = IS_INODE(node_page) ? 
DEF_ADDRS_PER_INODE : + DEF_ADDRS_PER_BLOCK; + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", + ofs_in_node, dni->ino, dni->nid, max_addrs); + return false; + } + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); @@ -1159,6 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } goto got_it; @@ -1177,6 +1183,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } got_it: @@ -1206,8 +1213,8 @@ got_it: f2fs_put_page(fio.encrypted_page, 0); f2fs_put_page(page, 1); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); return 0; put_encrypted_page: @@ -1307,8 +1314,10 @@ static int move_data_block(struct inode *inode, block_t bidx, goto up_out; } - f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, + F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, + F2FS_BLKSIZE); lock_page(mpage); if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || @@ -1360,7 +1369,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } - f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, 
FI_APPEND_WRITE); @@ -1706,7 +1715,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); goto skip; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bf46a7dfbea2..21a495234ffd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -64,7 +64,6 @@ bool f2fs_may_inline_dentry(struct inode *inode) void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; - void *src_addr, *dst_addr; if (PageUptodate(page)) return; @@ -74,11 +73,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage) zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(inode, ipage); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - flush_dcache_page(page); - kunmap_atomic(dst_addr); + memcpy_to_page(page, 0, inline_data_addr(inode, ipage), + MAX_INLINE_DATA(inode)); if (!PageUptodate(page)) SetPageUptodate(page); } @@ -164,6 +160,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_sbi_flag(fio.sbi, SBI_NEED_FSCK); f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -246,7 +243,6 @@ out: int f2fs_write_inline_data(struct inode *inode, struct page *page) { - void *src_addr, *dst_addr; struct dnode_of_data dn; int err; @@ -263,10 +259,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); - src_addr = kmap_atomic(page); - dst_addr = 
inline_data_addr(inode, dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - kunmap_atomic(src_addr); + memcpy_from_page(inline_data_addr(inode, dn.inode_page), + page, 0, MAX_INLINE_DATA(inode)); set_page_dirty(dn.inode_page); f2fs_clear_page_cache_dirty_tag(page); @@ -419,6 +413,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); + f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; goto out; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6d11c365d7b4..9f0d3864d9f1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -81,8 +81,10 @@ static int __written_first_block(struct f2fs_sb_info *sbi, if (!__is_valid_data_blkaddr(addr)) return 1; - if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } return 0; } @@ -333,6 +335,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return true; } +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -405,6 +417,7 @@ static int do_read_inode(struct inode *inode) if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } @@ -465,10 +478,7 @@ static int do_read_inode(struct inode *inode) } } - fi->i_disk_time[0] = inode->i_atime; - fi->i_disk_time[1] = inode->i_ctime; - fi->i_disk_time[2] = inode->i_mtime; - 
fi->i_disk_time[3] = fi->i_crtime; + init_idisk_time(inode); f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -480,6 +490,12 @@ static int do_read_inode(struct inode *inode) return 0; } +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -491,16 +507,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return ERR_PTR(ret); + } + trace_f2fs_iget(inode); return inode; } - if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) - goto make_now; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (ino == F2FS_COMPRESS_INO(sbi)) + if (is_meta_ino(sbi, ino)) goto make_now; -#endif ret = do_read_inode(inode); if (ret) @@ -676,11 +698,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_page_private_inline(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; - + init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); #endif @@ -699,7 +717,8 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_UPDATE_INODE); } return; } diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index d84c5f6cc09d..3166a8939ed4 100644 --- a/fs/f2fs/iostat.c +++ 
b/fs/f2fs/iostat.c @@ -31,55 +31,65 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) /* print app write IOs */ seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_IO]); /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", + seq_printf(seq, "fs gc node: %-16llu\n", sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", + seq_printf(seq, "fs cp data: %-16llu\n", sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", + seq_printf(seq, "fs cp node: %-16llu\n", sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", + seq_printf(seq, "fs cp meta: %-16llu\n", sbi->rw_iostat[FS_CP_META_IO]); /* print app read IOs */ seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct 
data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_READ_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs compr_data: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_READ_IO]); /* print other IOs */ seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", + seq_printf(seq, "fs discard: %-16llu\n", sbi->rw_iostat[FS_DISCARD]); return 0; @@ -159,7 +169,7 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) spin_unlock_irq(&sbi->iostat_lat_lock); } -void f2fs_update_iostat(struct f2fs_sb_info *sbi, +void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) { unsigned long flags; @@ -176,6 +186,28 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) sbi->rw_iostat[APP_READ_IO] += io_bytes; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (inode && f2fs_compressed_file(inode)) { + if (type == APP_BUFFERED_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_READ_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + 
+ if (type == APP_MAPPED_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + + if (type == FS_DATA_READ_IO) + sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + + if (type == FS_DATA_IO) + sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + } +#endif + spin_unlock_irqrestore(&sbi->iostat_lock, flags); f2fs_record_iostat(sbi); diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 22a2d01f57ef..2c048307b6e0 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -31,7 +31,7 @@ struct iostat_lat_info { extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset); extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); -extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes); struct bio_iostat_ctx { @@ -65,7 +65,7 @@ extern void f2fs_destroy_iostat_processing(void); extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) {} static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index bf00d5057abb..a389772fd212 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -50,7 +50,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; @@ -845,7 +845,7 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode 
*dir, - struct dentry *dentry, umode_t mode, bool is_whiteout, + struct file *file, umode_t mode, bool is_whiteout, struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -892,8 +892,8 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } else { - if (dentry) - d_tmpfile(dentry, inode); + if (file) + d_tmpfile(file, inode); else f2fs_i_links_write(inode, false); } @@ -915,16 +915,19 @@ out: } static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); + err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); + + return finish_open_simple(file, err); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e06a0c478b39..983572f23896 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } return 0; @@ -585,7 +586,7 @@ retry: ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - up_read(&curseg->journal_rwsem); + up_read(&curseg->journal_rwsem); if (i >= 0) { f2fs_up_read(&nm_i->nat_tree_lock); goto cache; @@ -1295,6 +1296,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) if (unlikely(new_ni.blk_addr != NULL_ADDR)) { err = -EFSCORRUPTED; set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } #endif @@ -1369,7 +1371,7 @@ static int 
read_node_page(struct page *page, blk_opf_t op_flags) err = f2fs_submit_page_bio(&fio); if (!err) - f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE); return err; } @@ -2147,8 +2149,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, if (IS_INODE(&folio->page)) f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); #endif - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); set_page_private_reference(&folio->page); return true; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dcd0a1e35095..dea95b48b647 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -474,7 +474,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; - unsigned int offset; + unsigned int offset, ofs_in_node, max_addrs; block_t bidx; int i; @@ -501,15 +501,25 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); + ofs_in_node = le16_to_cpu(sum.ofs_in_node); + + max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode); + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + ofs_in_node, dn->inode->i_ino, nid, max_addrs); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); + return -EFSCORRUPTED; + } + if (dn->inode->i_ino == nid) { tdn.nid = nid; if (!dn->inode_page_locked) lock_page(dn->inode_page); tdn.node_page = dn->inode_page; - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } else if (dn->nid == nid) { - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } @@ -628,6 +638,7 @@ retry_dn: inode->i_ino, ofs_of_node(dn.node_page), 
ofs_of_node(page)); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } @@ -640,12 +651,14 @@ retry_dn: if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } if (__is_valid_data_blkaddr(dest) && !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } @@ -698,6 +711,16 @@ retry_prev: goto err; } + if (f2fs_is_valid_blkaddr(sbi, dest, + DATA_GENERIC_ENHANCE_UPDATE)) { + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + dest, inode->i_ino, dn.ofs_in_node); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto err; + } + /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, ni.version, false, false); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0de21f82d7bc..acf3d3fa4363 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -187,7 +187,6 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) @@ -200,10 +199,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_FILE); - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files--; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + stat_dec_atomic_inode(inode); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -312,6 +308,8 @@ static int __f2fs_commit_atomic_write(struct inode *inode) DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -376,7 +374,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, 
FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); } /* balance_fs_bg is able to be pending */ @@ -476,12 +474,12 @@ do_sync: mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); - f2fs_sync_dirty_inodes(sbi, FILE_INODE); + f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); } - f2fs_sync_fs(sbi->sb, true); + f2fs_sync_fs(sbi->sb, 1); stat_inc_bg_cp_count(sbi->stat_info); } @@ -694,7 +692,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) } while (ret && --count); if (ret) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); break; } @@ -1171,7 +1170,7 @@ submit: atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); lstart += len; start += len; @@ -2535,7 +2534,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ if (__is_large_section(sbi)) @@ -2589,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - prandom_u32() % sbi->max_fragment_chunk + 1; + prandom_u32_max(sbi->max_fragment_chunk) + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2626,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - prandom_u32() % sbi->max_fragment_chunk + 1; + prandom_u32_max(sbi->max_fragment_chunk) + 1; seg->next_blkoff += - prandom_u32() % sbi->max_fragment_hole + 1; + prandom_u32_max(sbi->max_fragment_hole) + 1; } } } @@ -3388,7 +3387,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); - f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -3398,7 +3397,7 @@ void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, @@ -3412,7 +3411,7 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); - f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) @@ -3432,6 +3431,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, 
segno); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } @@ -3453,7 +3453,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->page->mapping->host, + fio->io_type, F2FS_BLKSIZE); } return err; @@ -4379,6 +4380,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } @@ -4415,6 +4418,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } @@ -4434,6 +4438,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } @@ -4465,6 +4470,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } @@ -4473,6 +4479,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } @@ -4623,6 +4630,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } @@ -4640,6 +4648,7 @@ out: "Current 
segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d1d63766f2c7..be8f2d7d007b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -753,6 +753,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } @@ -767,6 +768,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 26817b5aeac7..3834ead04620 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -301,10 +301,10 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count << 1) / 1000, + block_t limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); - /* limit is 0.2% */ + /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && F2FS_OPTION(sbi).root_reserved_blocks > limit) { F2FS_OPTION(sbi).root_reserved_blocks = limit; @@ -1342,6 +1342,11 @@ default_check: return -EINVAL; } + if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with ATGC"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1666,9 +1671,8 @@ static int f2fs_freeze(struct super_block *sb) if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) 
return -EINVAL; - /* ensure no checkpoint required */ - if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) - return -EINVAL; + /* Let's flush checkpoints and stop the thread. */ + f2fs_flush_ckpt_thread(F2FS_SB(sb)); /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); @@ -2181,6 +2185,9 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); + + /* Let's ensure there's no pending checkpoint anymore */ + f2fs_flush_ckpt_thread(sbi); } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -2346,6 +2353,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_stop_ckpt_thread(sbi); need_restart_ckpt = true; } else { + /* Flush if the prevous checkpoint, if exists. */ + f2fs_flush_ckpt_thread(sbi); + err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, @@ -2465,7 +2475,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t toread; loff_t i_size = i_size_read(inode); struct page *page; - char *kaddr; if (off > i_size) return 0; @@ -2498,9 +2507,7 @@ repeat: return -EIO; } - kaddr = kmap_atomic(page); - memcpy(data, kaddr + offset, tocopy); - kunmap_atomic(kaddr); + memcpy_from_page(data, page, offset, tocopy); f2fs_put_page(page, 1); offset = 0; @@ -2522,7 +2529,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct page *page; void *fsdata = NULL; - char *kaddr; int err = 0; int tocopy; @@ -2541,10 +2547,7 @@ retry: break; } - kaddr = kmap_atomic(page); - memcpy(kaddr + offset, data, tocopy); - kunmap_atomic(kaddr); - flush_dcache_page(page); + memcpy_to_page(page, offset, data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, page, fsdata); @@ -3843,6 +3846,68 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +{ 
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int err; + + f2fs_down_write(&sbi->sb_lock); + + if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", + reason, err); + f2fs_up_write(&sbi->sb_lock); +} + +static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + spin_lock(&sbi->error_lock); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); + sbi->error_dirty = true; + } + spin_unlock(&sbi->error_lock); +} + +static bool f2fs_update_errors(struct f2fs_sb_info *sbi) +{ + bool need_update = false; + + spin_lock(&sbi->error_lock); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + need_update = true; + } + spin_unlock(&sbi->error_lock); + + return need_update; +} + +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + int err; + + f2fs_save_errors(sbi, error); + + f2fs_down_write(&sbi->sb_lock); + + if (!f2fs_update_errors(sbi)) + goto out_unlock; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); +out_unlock: + f2fs_up_write(&sbi->sb_lock); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -4190,6 +4255,9 @@ try_onemore: goto free_devices; } + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index eba5fb1629d7..df27afd71ef4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -128,6 +128,12 @@ static ssize_t sb_status_show(struct f2fs_attr *a, 
return sprintf(buf, "%lx\n", sbi->s_flag); } +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); +} + static ssize_t pending_discard_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -527,7 +533,6 @@ out: if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_limited = t != 0; sbi->gc_urgent_high_remaining = t; spin_unlock(&sbi->gc_urgent_high_lock); @@ -1030,8 +1035,10 @@ static struct attribute *f2fs_feat_attrs[] = { ATTRIBUTE_GROUPS(f2fs_feat); F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), + ATTR_LIST(cp_status), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 7b8f2b41c29b..c352fff88a5e 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -47,16 +47,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); @@ -85,16 +82,13 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + memcpy_to_page(page, offset_in_page(pos), buf, n); res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); if (res < 0) @@ -246,6 +240,8 @@ static int 
f2fs_get_verity_descriptor(struct inode *inode, void *buf, if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); return -EFSCORRUPTED; } if (buf_size) { @@ -262,13 +258,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c76c15086e5f..dc2e8637189e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -367,6 +367,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto out; } check: @@ -583,6 +585,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto cleanup; } @@ -658,6 +662,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } @@ -684,6 +690,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino, ENTRY_SIZE(last)); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } last 
= XATTR_NEXT_ENTRY(last); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a38238d75c08..1cbcc4608dc7 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -523,7 +523,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { inode->i_generation &= ~1; diff --git a/fs/file.c b/fs/file.c index 3bcc1ecc314a..5f9c802a5d8d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -980,6 +980,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret *ret_fd = fd; return file; } +EXPORT_SYMBOL(task_lookup_next_fd_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 51897427a534..b4a6e0a1b945 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -776,7 +776,8 @@ static int fuse_check_page(struct page *page) 1 << PG_active | 1 << PG_workingset | 1 << PG_reclaim | - 1 << PG_waiters))) { + 1 << PG_waiters | + LRU_GEN_MASK | LRU_REFS_MASK))) { dump_page(page, "fuse: trying to steal weird page"); return 1; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b585b04e815e..bb97a384dc5d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -529,7 +529,7 @@ out_err: */ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, - umode_t mode) + umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -573,7 +573,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = FUSE_CREATE; + args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); @@ -676,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = 
fuse_create_open(dir, entry, file, flags, mode); + err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -802,6 +802,23 @@ static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } +static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + int err; + + if (fc->no_tmpfile) + return -EOPNOTSUPP; + + err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + if (err == -ENOSYS) { + fc->no_tmpfile = 1; + err = -EOPNOTSUPP; + } + return err; +} + static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -1913,6 +1930,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, + .tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1a3afd469e3a..71bfb663aac5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3001,6 +3001,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; } + err = file_modified(file); + if (err) + goto out; + if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 488b460e046f..98a9cf531873 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -784,6 +784,9 @@ struct fuse_conn { /* Does the filesystem support per inode DAX? */ unsigned int inode_dax:1; + /* Is tmpfile not implemented by fs? 
*/ + unsigned int no_tmpfile:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index b4e565711045..e8deaacf1832 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -77,8 +77,10 @@ static void fuse_add_dirent_to_cache(struct file *file, goto unlock; addr = kmap_local_page(page); - if (!offset) + if (!offset) { clear_page(addr); + SetPageUptodate(page); + } memcpy(addr + offset, dirent, reclen); kunmap_local(addr); fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; @@ -516,6 +518,12 @@ retry_locked: page = find_get_page_flags(file->f_mapping, index, FGP_ACCESSED | FGP_LOCK); + /* Page gone missing, then re-added to cache, but not initialized? */ + if (page && !PageUptodate(page)) { + unlock_page(page); + put_page(page); + page = NULL; + } spin_lock(&fi->rdc.lock); if (!page) { /* diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 892006fbbb09..60c6fb91fb58 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1443,6 +1443,22 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); } +static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) +{ + struct gfs2_glock *gl = fl_gh->gh_gl; + + /* + * Make sure gfs2_glock_put() won't sleep under the file->f_lock + * spinlock. + */ + + gfs2_glock_hold(gl); + spin_lock(&file->f_lock); + gfs2_holder_uninit(fl_gh); + spin_unlock(&file->f_lock); + gfs2_glock_put(gl); +} + static int do_flock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_file *fp = file->private_data; @@ -1455,7 +1471,9 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int sleeptime; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 
0 : LM_FLAG_TRY_1CB) | GL_EXACT; + flags = GL_EXACT | GL_NOPID; + if (!IS_SETLKW(cmd)) + flags |= LM_FLAG_TRY_1CB; mutex_lock(&fp->f_fl_mutex); @@ -1474,18 +1492,21 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) &gfs2_flock_glops, CREATE, &gl); if (error) goto out; + spin_lock(&file->f_lock); gfs2_holder_init(gl, state, flags, fl_gh); + spin_unlock(&file->f_lock); gfs2_glock_put(gl); } for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { error = gfs2_glock_nq(fl_gh); if (error != GLR_TRYFAILED) break; - fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; + fl_gh->gh_flags &= ~LM_FLAG_TRY_1CB; + fl_gh->gh_flags |= LM_FLAG_TRY; msleep(sleeptime); } if (error) { - gfs2_holder_uninit(fl_gh); + __flock_holder_uninit(file, fl_gh); if (error == GLR_TRYFAILED) error = -EAGAIN; } else { @@ -1507,7 +1528,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) locks_lock_file_wait(file, fl); if (gfs2_holder_initialized(fl_gh)) { gfs2_glock_dq(fl_gh); - gfs2_holder_uninit(fl_gh); + __flock_holder_uninit(file, fl_gh); } mutex_unlock(&fp->f_fl_mutex); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 41b6c89e4bf7..df335c258eb0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -33,6 +33,9 @@ #include <linux/list_sort.h> #include <linux/lockref.h> #include <linux/rhashtable.h> +#include <linux/pid_namespace.h> +#include <linux/fdtable.h> +#include <linux/file.h> #include "gfs2.h" #include "incore.h" @@ -59,6 +62,8 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static void __gfs2_glock_dq(struct gfs2_holder *gh); +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay, bool remote); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; @@ -730,7 +735,8 @@ static bool is_system_glock(struct gfs2_glock *gl) * */ -static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder 
*gh, unsigned int target) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, + unsigned int target) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { @@ -741,7 +747,8 @@ __acquires(&gl->gl_lockref.lock) if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) - return; + goto skip_inval; + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); GLOCK_BUG_ON(gl, gl->gl_state == target); @@ -826,6 +833,20 @@ skip_inval: (target != LM_ST_UNLOCKED || test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { if (!is_system_glock(gl)) { + handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */ + /* + * Ordinarily, we would call dlm and its callback would call + * finish_xmote, which would call state_change() to the new state. + * Since we withdrew, we won't call dlm, so call state_change + * manually, but to the UNLOCKED state we desire. + */ + state_change(gl, LM_ST_UNLOCKED); + /* + * We skip telling dlm to do the locking, so we won't get a + * reply that would otherwise clear GLF_LOCK. So we clear it here. + */ + clear_bit(GLF_LOCK, &gl->gl_flags); + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); goto out; } else { @@ -1018,16 +1039,18 @@ static void delete_work_func(struct work_struct *work) if (gfs2_queue_delete_work(gl, 5 * HZ)) return; } - goto out; } inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, GFS2_BLKST_UNLINKED); - if (!IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -EAGAIN && + (gfs2_queue_delete_work(gl, 5 * HZ))) + return; + } else { d_prune_aliases(inode); iput(inode); } -out: gfs2_glock_put(gl); } @@ -1436,6 +1459,15 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 
va_end(args); } +static inline bool pid_is_meaningful(const struct gfs2_holder *gh) +{ + if (!(gh->gh_flags & GL_NOPID)) + return true; + if (gh->gh_state == LM_ST_UNLOCKED) + return true; + return false; +} + /** * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add @@ -1472,10 +1504,17 @@ __acquires(&gl->gl_lockref.lock) } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { - if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && - (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) && - !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))) - goto trap_recursive; + if (likely(gh2->gh_owner_pid != gh->gh_owner_pid)) + continue; + if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) + continue; + if (test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)) + continue; + if (!pid_is_meaningful(gh2)) + continue; + goto trap_recursive; + } + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (try_futile && !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { fail: @@ -2194,6 +2233,20 @@ static void dump_glock_func(struct gfs2_glock *gl) dump_glock(NULL, gl, true); } +static void withdraw_dq(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_lockref.lock); + if (!__lockref_is_dead(&gl->gl_lockref) && + glock_blocked_by_withdraw(gl)) + do_error(gl, LM_OUT_ERROR); /* remove pending waiters */ + spin_unlock(&gl->gl_lockref.lock); +} + +void gfs2_gl_dq_holders(struct gfs2_sbd *sdp) +{ + glock_hash_walk(withdraw_dq, sdp); +} + /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem @@ -2272,19 +2325,24 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh, const char *fs_id_buf) { - struct task_struct *gh_owner = NULL; + const char *comm = "(none)"; + pid_t owner_pid = 0; char flags_buf[32]; rcu_read_lock(); - if (gh->gh_owner_pid) + if (pid_is_meaningful(gh)) { + struct task_struct *gh_owner; + + comm = "(ended)"; + owner_pid = 
pid_nr(gh->gh_owner_pid); gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); + if (gh_owner) + comm = gh_owner->comm; + } gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n", fs_id_buf, state2str(gh->gh_state), hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), - gh->gh_error, - gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, - gh_owner ? gh_owner->comm : "(ended)", - (void *)gh->gh_ip); + gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip); rcu_read_unlock(); } @@ -2699,6 +2757,172 @@ static const struct file_operations gfs2_glstats_fops = { .release = gfs2_glocks_release, }; +struct gfs2_glockfd_iter { + struct super_block *sb; + unsigned int tgid; + struct task_struct *task; + unsigned int fd; + struct file *file; +}; + +static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct pid *pid; + + if (i->task) + put_task_struct(i->task); + + rcu_read_lock(); +retry: + i->task = NULL; + pid = find_ge_pid(i->tgid, ns); + if (pid) { + i->tgid = pid_nr_ns(pid, ns); + i->task = pid_task(pid, PIDTYPE_TGID); + if (!i->task) { + i->tgid++; + goto retry; + } + get_task_struct(i->task); + } + rcu_read_unlock(); + return i->task; +} + +static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) +{ + if (i->file) { + fput(i->file); + i->file = NULL; + } + + rcu_read_lock(); + for(;; i->fd++) { + struct inode *inode; + + i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + if (!i->file) { + i->fd = 0; + break; + } + inode = file_inode(i->file); + if (inode->i_sb != i->sb) + continue; + if (get_file_rcu(i->file)) + break; + } + rcu_read_unlock(); + return i->file; +} + +static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (*pos) + return NULL; + while (gfs2_glockfd_next_task(i)) { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } + return NULL; +} + +static void 
*gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + (*pos)++; + i->fd++; + do { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } while (gfs2_glockfd_next_task(i)); + return NULL; +} + +static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (i->file) + fput(i->file); + if (i->task) + put_task_struct(i->task); +} + +static void gfs2_glockfd_seq_show_flock(struct seq_file *seq, + struct gfs2_glockfd_iter *i) +{ + struct gfs2_file *fp = i->file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED }; + + if (!READ_ONCE(fl_gh->gh_gl)) + return; + + spin_lock(&i->file->f_lock); + if (gfs2_holder_initialized(fl_gh)) + gl_name = fl_gh->gh_gl->gl_name; + spin_unlock(&i->file->f_lock); + + if (gl_name.ln_type != LM_TYPE_RESERVED) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl_name.ln_type, + (unsigned long long)gl_name.ln_number); + } +} + +static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + struct inode *inode = file_inode(i->file); + struct gfs2_glock *gl; + + inode_lock_shared(inode); + gl = GFS2_I(inode)->i_iopen_gh.gh_gl; + if (gl) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + } + gfs2_glockfd_seq_show_flock(seq, i); + inode_unlock_shared(inode); + return 0; +} + +static const struct seq_operations gfs2_glockfd_seq_ops = { + .start = gfs2_glockfd_seq_start, + .next = gfs2_glockfd_seq_next, + .stop = gfs2_glockfd_seq_stop, + .show = gfs2_glockfd_seq_show, +}; + +static int gfs2_glockfd_open(struct inode *inode, struct file *file) +{ + struct gfs2_glockfd_iter *i; + struct gfs2_sbd *sdp = inode->i_private; + + i = __seq_open_private(file, &gfs2_glockfd_seq_ops, + sizeof(struct 
gfs2_glockfd_iter)); + if (!i) + return -ENOMEM; + i->sb = sdp->sd_vfs; + return 0; +} + +static const struct file_operations gfs2_glockfd_fops = { + .owner = THIS_MODULE, + .open = gfs2_glockfd_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) @@ -2708,6 +2932,9 @@ void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glocks_fops); + debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glockfd_fops); + debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glstats_fops); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 5aed8b500cf5..0d068f4fd7d6 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -91,6 +91,7 @@ enum { #define GL_ASYNC 0x0040 #define GL_EXACT 0x0080 #define GL_SKIP 0x0100 +#define GL_NOPID 0x0200 #define GL_NOCACHE 0x0400 /* @@ -274,6 +275,7 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); extern void gfs2_glock_free(struct gfs2_glock *gl); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c8ec876f33ea..04a201584fa7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -130,6 +130,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; + int extra_flags = 0; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); @@ -141,9 +142,12 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int 
type, if (unlikely(error)) goto fail; - if (blktype != GFS2_BLKST_UNLINKED) + if (blktype == GFS2_BLKST_UNLINKED) + extra_flags |= LM_FLAG_TRY; + else gfs2_cancel_delete_work(io_gl); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, + GL_EXACT | GL_NOPID | extra_flags, &ip->i_iopen_gh); gfs2_glock_put(io_gl); if (unlikely(error)) @@ -210,6 +214,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail: + if (error == GLR_TRYFAILED) + error = -EAGAIN; if (gfs2_holder_initialized(&ip->i_iopen_gh)) gfs2_glock_dq_uninit(&ip->i_iopen_gh); if (gfs2_holder_initialized(&i_gh)) @@ -720,7 +726,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); BUG_ON(error); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID, + &ip->i_iopen_gh); if (error) goto fail_gunlock2; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 14ae9de76277..afcb32854f14 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -151,14 +151,6 @@ static int __init init_gfs2_fs(void) if (error) goto fail_shrinker; - error = register_filesystem(&gfs2_fs_type); - if (error) - goto fail_fs1; - - error = register_filesystem(&gfs2meta_fs_type); - if (error) - goto fail_fs2; - error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); @@ -180,11 +172,23 @@ static int __init init_gfs2_fs(void) goto fail_mempool; gfs2_register_debugfs(); + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail_fs1; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_fs2; + pr_info("GFS2 installed\n"); return 0; +fail_fs2: + unregister_filesystem(&gfs2_fs_type); +fail_fs1: + mempool_destroy(gfs2_page_pool); fail_mempool: destroy_workqueue(gfs2_freeze_wq); fail_wq3: 
@@ -192,10 +196,6 @@ fail_wq3: fail_wq2: destroy_workqueue(gfs_recovery_wq); fail_wq1: - unregister_filesystem(&gfs2meta_fs_type); -fail_fs2: - unregister_filesystem(&gfs2_fs_type); -fail_fs1: unregister_shrinker(&gfs2_qd_shrinker); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 7e70e0ba5a6c..6ed728aae9a5 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -525,8 +525,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; - if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh); + bh_read_nowait(first_bh, REQ_META | REQ_PRIO); dblock++; extlen--; @@ -534,9 +533,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) while (extlen) { bh = gfs2_getbuf(gl, dblock, CREATE); - if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META | - REQ_PRIO, 1, &bh); + bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 549879929c84..c0cf1d2d0ef5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -178,7 +178,10 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) pr_warn("Invalid block size\n"); return -EINVAL; } - + if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) { + pr_warn("Invalid block size shift\n"); + return -EINVAL; + } return 0; } @@ -381,8 +384,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent) if (!table[0]) table = sdp->sd_vfs->s_id; - strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); - strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); + BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN); + + strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN); + strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN); table = sdp->sd_table_name; while ((table = strchr(table, '/'))) @@ -401,7 +406,8 @@ static 
int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_nq_num(sdp, GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, - LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, mount_gh); if (error) { fs_err(sdp, "can't acquire mount glock: %d\n", error); @@ -411,7 +417,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_nq_num(sdp, GFS2_LIVE_LOCK, &gfs2_nondisk_glops, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, + LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, &sdp->sd_live_gh); if (error) { fs_err(sdp, "can't acquire live glock: %d\n", error); @@ -687,7 +693,7 @@ static int init_statfs(struct gfs2_sbd *sdp) iput(pn); pn = NULL; ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, &sdp->sd_sc_gh); if (error) { fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); @@ -776,7 +782,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, &gfs2_journal_glops, LM_ST_EXCLUSIVE, - LM_FLAG_NOEXP | GL_NOCACHE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, &sdp->sd_journal_gh); if (error) { fs_err(sdp, "can't acquire journal glock: %d\n", error); @@ -786,7 +792,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) ip = GFS2_I(sdp->sd_jdesc->jd_inode); sdp->sd_jinode_gl = ip->i_gl; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE, + LM_FLAG_NOEXP | GL_EXACT | + GL_NOCACHE | GL_NOPID, &sdp->sd_jinode_gh); if (error) { fs_err(sdp, "can't acquire journal inode glock: %d\n", @@ -957,7 +964,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) pn = NULL; ip = GFS2_I(sdp->sd_qc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock 
local \"qc\" file: %d\n", error); @@ -1439,13 +1446,13 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (o) { case Opt_lockproto: - strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); break; case Opt_locktable: - strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); break; case Opt_hostdata: - strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); break; case Opt_spectator: args->ar_spectator = 1; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f201eaf59d0d..1ed17226d9ed 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -745,12 +745,8 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, } if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - goto unlock_out; - } + if (bh_read(bh, REQ_META | REQ_PRIO) < 0) + goto unlock_out; if (gfs2_is_jdata(ip)) gfs2_trans_add_data(ip->i_gl, bh); else diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b5b0f285b27f..b018957a1bb2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -346,7 +346,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) } error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, - LM_FLAG_NOEXP, &sdp->sd_freeze_gh); + LM_FLAG_NOEXP | GL_NOPID, + &sdp->sd_freeze_gh); if (error) goto out; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 8241029a2a5d..7a6aeffcdf5c 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -164,6 +164,11 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) } if (!ret) gfs2_make_fs_ro(sdp); + /* + * Dequeue any pending non-system glock holders that can no + * longer be granted because the file system is withdrawn. 
+ */ + gfs2_gl_dq_holders(sdp); gfs2_freeze_unlock(&freeze_gh); } @@ -204,6 +209,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) * exception code in glock_dq. */ iput(inode); + sdp->sd_jdesc->jd_inode = NULL; /* * Wait until the journal inode's glock is freed. This allows try locks * on other nodes to be successful, otherwise we remain the owner of @@ -226,7 +232,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) */ fs_warn(sdp, "Requesting recovery of jid %d.\n", sdp->sd_lockstruct.ls_jid); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP, + gfs2_holder_reinit(LM_ST_EXCLUSIVE, + LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | GL_NOPID, &sdp->sd_live_gh); msleep(GL_GLOCK_MAX_HOLD); /* @@ -251,7 +258,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) fs_warn(sdp, "Unable to recover our journal jid %d.\n", sdp->sd_lockstruct.ls_jid); gfs2_glock_dq_wait(&sdp->sd_live_gh); - gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT, + gfs2_holder_reinit(LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, &sdp->sd_live_gh); gfs2_glock_nq(&sdp->sd_live_gh); } diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index c83fd0e8404d..2015e42e752a 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -21,7 +21,6 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) int pagenum; int bytes_read; int bytes_to_read; - void *vaddr; off += node->page_offset; pagenum = off >> PAGE_SHIFT; @@ -33,9 +32,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) page = node->page[pagenum]; bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off); - vaddr = kmap_atomic(page); - memcpy(buf + bytes_read, vaddr + off, bytes_to_read); - kunmap_atomic(vaddr); + memcpy_from_page(buf + bytes_read, page, off, bytes_to_read); pagenum++; off = 0; /* page offset only applies to the first page */ @@ -80,8 +77,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off += node->page_offset; page = 
node->page[0]; - memcpy(kmap(page) + off, buf, len); - kunmap(page); + memcpy_to_page(page, off, buf, len); set_page_dirty(page); } @@ -105,8 +101,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off += node->page_offset; page = node->page[0]; - memset(kmap(page) + off, 0, len); - kunmap(page); + memzero_page(page, off, len); set_page_dirty(page); } @@ -123,9 +118,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, src_page = src_node->page[0]; dst_page = dst_node->page[0]; - memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); - kunmap(src_page); - kunmap(dst_page); + memcpy_page(dst_page, dst, src_page, src, len); set_page_dirty(dst_page); } @@ -140,9 +133,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) src += node->page_offset; dst += node->page_offset; page = node->page[0]; - ptr = kmap(page); + ptr = kmap_local_page(page); memmove(ptr + dst, ptr + src, len); - kunmap(page); + kunmap_local(ptr); set_page_dirty(page); } @@ -346,13 +339,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -436,14 +430,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_SIZE, (int)tree->node_size)); + memzero_page(*pagep, node->page_offset, + min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + 
memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 19017d296173..2fa4b1f8cc7f 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -80,7 +80,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); @@ -119,11 +120,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; @@ -169,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree) return; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); @@ -180,7 +183,7 @@ void hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); } @@ -268,7 +271,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -281,7 +284,7 @@ 
struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -290,14 +293,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { printk(KERN_DEBUG "create new bmap node...\n"); @@ -313,7 +316,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -360,20 +363,20 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode %u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index cebce0cfe340..bd8dcea85588 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; offset &= ~(PAGE_CACHE_BITS - 1); @@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, } curr++; } - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; if (offset >= size) break; @@ -84,7 +84,7 @@ int 
hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - curr = pptr = kmap(page); + curr = pptr = kmap_local_page(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; else @@ -127,7 +127,7 @@ found: len -= 32; } set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -135,7 +135,7 @@ found: start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -151,7 +151,7 @@ last: done: *curr = cpu_to_be32(n); set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); @@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) page = read_mapping_page(mapping, pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; end = pptr + PAGE_CACHE_BITS / 32; len = count; @@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) break; set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); page = read_mapping_page(mapping, ++pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -231,7 +231,7 @@ done: } out: set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); sbi->free_blocks += len; hfsplus_mark_mdb_dirty(sb); mutex_unlock(&sbi->alloc_mutex); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index a5ab00e54220..87974d5e6791 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -29,14 +29,12 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(buf, kmap(*pagep) + off, l); - kunmap(*pagep); 
+ memcpy_from_page(buf, *pagep, off, l); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(buf, kmap(*++pagep), l); - kunmap(*pagep); + memcpy_from_page(buf, *++pagep, 0, l); } } @@ -82,16 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(kmap(*pagep) + off, buf, l); + memcpy_to_page(*pagep, off, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++pagep), buf, l); + memcpy_to_page(*++pagep, 0, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -112,15 +108,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memset(kmap(*pagep) + off, 0, l); + memzero_page(*pagep, off, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memset(kmap(*++pagep), 0, l); + memzero_page(*++pagep, 0, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -142,24 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); - kunmap(*src_page); + memcpy_page(*dst_page, src, *src_page, src, l); set_page_dirty(*dst_page); - kunmap(*dst_page); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++dst_page), kmap(*++src_page), l); - kunmap(*src_page); + memcpy_page(*++dst_page, 0, *++src_page, 0, l); set_page_dirty(*dst_page); - kunmap(*dst_page); } } else { void *src_ptr, *dst_ptr; do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; src = 0; @@ -171,9 +161,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, } l = min(len, l); 
memcpy(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -185,6 +175,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) { struct page **src_page, **dst_page; + void *src_ptr, *dst_ptr; int l; hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); @@ -202,27 +193,28 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { while (src < len) { - memmove(kmap(*dst_page), kmap(*src_page), src); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr, src_ptr, src); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); len -= src; src = PAGE_SIZE; src_page--; dst_page--; } src -= len; - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, len); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr + src, src_ptr + src, len); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (src < dst) { l = src; src = PAGE_SIZE; @@ -234,9 +226,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr - l, src_ptr - l, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (dst == PAGE_SIZE) dst_page--; else @@ -251,26 +243,27 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memmove(kmap(*dst_page) + src, - kmap(*src_page) 
+ src, l); - kunmap(*src_page); + + dst_ptr = kmap_local_page(*dst_page) + src; + src_ptr = kmap_local_page(*src_page) + src; + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memmove(kmap(*++dst_page), - kmap(*++src_page), l); - kunmap(*src_page); + dst_ptr = kmap_local_page(*++dst_page); + src_ptr = kmap_local_page(*++src_page); + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; @@ -283,9 +276,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -498,14 +491,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + - node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -589,14 +582,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min_t(int, PAGE_SIZE, tree->node_size)); + memzero_page(*pagep, node->page_offset, + min_t(int, 
PAGE_SIZE, tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 66774f4cb4fd..9e1732a2b92a 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -163,7 +163,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); @@ -240,11 +240,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; @@ -291,7 +292,7 @@ int hfs_btree_write(struct hfs_btree *tree) return -EIO; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); @@ -303,7 +304,7 @@ int hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); return 0; @@ -394,7 +395,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -407,7 +408,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; 
data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -417,14 +418,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { hfs_dbg(BNODE_MOD, "create new bmap node\n"); @@ -440,7 +441,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -490,7 +491,7 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; @@ -498,13 +499,13 @@ void hfs_bmap_free(struct hfs_bnode *node) pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07881b76d42f..277468783fee 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -103,7 +103,7 @@ static char *__dentry_name(struct dentry *dentry, char *name) */ BUG_ON(p + strlen(p) + 1 != name + PATH_MAX); - strlcpy(name, root, PATH_MAX); + strscpy(name, root, PATH_MAX); if (len > p - name) { __putname(name); return NULL; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f7a5b5124d8a..dd54f67e47fd 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,13 +364,155 @@ static int hugetlbfs_write_end(struct file *file, 
struct address_space *mapping, return -EINVAL; } -static void remove_huge_page(struct page *page) +static void hugetlb_delete_from_page_cache(struct page *page) { ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } +/* + * Called with i_mmap_rwsem held for inode based vma maps. This makes + * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault + * mutex for the page in the mapping. So, we can not race with page being + * faulted into the vma. + */ +static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + pte_t *ptep, pte; + + ptep = huge_pte_offset(vma->vm_mm, addr, + huge_page_size(hstate_vma(vma))); + + if (!ptep) + return false; + + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte) || !pte_present(pte)) + return false; + + if (pte_page(pte) == page) + return true; + + return false; +} + +/* + * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ +static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) +{ + if (vma->vm_pgoff < start) + return (start - vma->vm_pgoff) << PAGE_SHIFT; + else + return 0; +} + +static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) +{ + unsigned long t_end; + + if (!end) + return vma->vm_end; + + t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; + if (t_end > vma->vm_end) + t_end = vma->vm_end; + return t_end; +} + +/* + * Called with hugetlb fault mutex held. Therefore, no more mappings to + * this folio can be created while executing the routine. 
+ */ +static void hugetlb_unmap_file_folio(struct hstate *h, + struct address_space *mapping, + struct folio *folio, pgoff_t index) +{ + struct rb_root_cached *root = &mapping->i_mmap; + struct hugetlb_vma_lock *vma_lock; + struct page *page = &folio->page; + struct vm_area_struct *vma; + unsigned long v_start; + unsigned long v_end; + pgoff_t start, end; + + start = index * pages_per_huge_page(h); + end = (index + 1) * pages_per_huge_page(h); + + i_mmap_lock_write(mapping); +retry: + vma_lock = NULL; + vma_interval_tree_foreach(vma, root, start, end - 1) { + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + continue; + + if (!hugetlb_vma_trylock_write(vma)) { + vma_lock = vma->vm_private_data; + /* + * If we can not get vma lock, we need to drop + * immap_sema and take locks in order. First, + * take a ref on the vma_lock structure so that + * we can be guaranteed it will not go away when + * dropping immap_sema. + */ + kref_get(&vma_lock->refs); + break; + } + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, ZAP_FLAG_DROP_MARKER); + hugetlb_vma_unlock_write(vma); + } + + i_mmap_unlock_write(mapping); + + if (vma_lock) { + /* + * Wait on vma_lock. We know it is still valid as we have + * a reference. We must 'open code' vma locking as we do + * not know if vma_lock is still attached to vma. + */ + down_write(&vma_lock->rw_sema); + i_mmap_lock_write(mapping); + + vma = vma_lock->vma; + if (!vma) { + /* + * If lock is no longer attached to vma, then just + * unlock, drop our reference and retry looking for + * other vmas. + */ + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + goto retry; + } + + /* + * vma_lock is still attached to vma. Check to see if vma + * still maps page and if so, unmap. 
+ */ + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + unmap_hugepage_range(vma, vma->vm_start + v_start, + v_end, NULL, + ZAP_FLAG_DROP_MARKER); + + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + hugetlb_vma_unlock_write(vma); + + goto retry; + } +} + static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) @@ -383,32 +525,66 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { - unsigned long v_offset; + unsigned long v_start; unsigned long v_end; + if (!hugetlb_vma_trylock_write(vma)) + continue; + + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, zap_flags); + /* - * Can the expression below overflow on 32-bit arches? - * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. + * Note that vma lock only exists for shared/non-private + * vmas. Therefore, lock is not held when calling + * unmap_hugepage_range for private vmas. */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (!end) - v_end = vma->vm_end; - else { - v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) - + vma->vm_start; - if (v_end > vma->vm_end) - v_end = vma->vm_end; - } + hugetlb_vma_unlock_write(vma); + } +} - unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, - NULL, zap_flags); +/* + * Called with hugetlb fault mutex held. + * Returns true if page was actually removed, false otherwise. 
+ */ +static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, + struct address_space *mapping, + struct folio *folio, pgoff_t index, + bool truncate_op) +{ + bool ret = false; + + /* + * If folio is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) while holding + * the fault mutex. The mutex will prevent faults + * until we finish removing the folio. + */ + if (unlikely(folio_mapped(folio))) + hugetlb_unmap_file_folio(h, mapping, folio, index); + + folio_lock(folio); + /* + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. + */ + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + hugetlb_delete_from_page_cache(&folio->page); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); } + + folio_unlock(folio); + return ret; } /* @@ -418,10 +594,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve - * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents - * page faults in the truncated range by checking i_size. i_size is - * modified while holding i_mmap_rwsem. + * maps and global counts. Page faults can race with truncation. + * During faults, hugetlb_no_page() checks i_size before page allocation, + * and again after obtaining page table lock. It will 'back out' + * allocations in the truncated range. 
* hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map @@ -451,61 +627,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash = 0; index = folio->index; - if (!truncate_op) { - /* - * Only need to hold the fault mutex in the - * hole punch case. This prevents races with - * page faults. Races are not possible in the - * case of truncation. - */ - hash = hugetlb_fault_mutex_hash(mapping, index); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - } + hash = hugetlb_fault_mutex_hash(mapping, index); + mutex_lock(&hugetlb_fault_mutex_table[hash]); /* - * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) now after taking - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. + * Remove folio that was part of folio_batch. */ - if (unlikely(folio_mapped(folio))) { - BUG_ON(truncate_op); - - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_lock_write(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } - - folio_lock(folio); - /* - * We must free the huge page and remove from page - * cache (remove_huge_page) BEFORE removing the - * region/reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the - * region/reserve map could fail. Correspondingly, - * the subpool and global reserve usage count can need - * to be adjusted. 
- */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - remove_huge_page(&folio->page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, - index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } - - folio_unlock(folio); - if (!truncate_op) - mutex_unlock(&hugetlb_fault_mutex_table[hash]); + if (remove_inode_single_folio(h, inode, mapping, folio, + index, truncate_op)) + freed++; + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); @@ -543,8 +675,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; - i_mmap_lock_write(mapping); i_size_write(inode, offset); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); @@ -703,11 +835,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* - * fault mutex taken here, protects against fault path - * and hole punch. inode_lock previously taken protects - * against truncation. 
- */ + /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -737,7 +865,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(page, addr, pages_per_huge_page(h)); __SetPageUptodate(page); - error = huge_add_to_page_cache(page, mapping, index); + error = hugetlb_add_to_page_cache(page, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); @@ -749,7 +877,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, SetHPageMigratable(page); /* - * unlock_page because locked by huge_add_to_page_cache() + * unlock_page because locked by hugetlb_add_to_page_cache() * put_page() due to reference from alloc_huge_page() */ unlock_page(page); @@ -885,33 +1013,18 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. */ -static int do_hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t dev, - bool tmpfile) +static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; - int error = -ENOSPC; inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); - if (inode) { - dir->i_ctime = dir->i_mtime = current_time(dir); - if (tmpfile) { - d_tmpfile(dentry, inode); - } else { - d_instantiate(dentry, inode); - dget(dentry);/* Extra count - pin the dentry in core */ - } - error = 0; - } - return error; -} - -static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) -{ - return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); + if (!inode) + return -ENOSPC; + dir->i_ctime = dir->i_mtime = current_time(dir); + d_instantiate(dentry, inode); + dget(dentry);/* Extra count - pin the dentry in core */ + return 0; } 
static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, @@ -932,10 +1045,17 @@ static int hugetlbfs_create(struct user_namespace *mnt_userns, } static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, - struct inode *dir, struct dentry *dentry, + struct inode *dir, struct file *file, umode_t mode) { - return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); + struct inode *inode; + + inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); + if (!inode) + return -ENOSPC; + dir->i_ctime = dir->i_mtime = current_time(dir); + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } static int hugetlbfs_symlink(struct user_namespace *mnt_userns, @@ -994,7 +1114,7 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping, struct inode *inode = mapping->host; pgoff_t index = page->index; - remove_huge_page(page); + hugetlb_delete_from_page_cache(page); if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index b466172eec25..c4da3f634b92 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -67,8 +67,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, for ( i = 0 ; i < pcount ; i++ ) { if (!pages[i]) continue; - memset(page_address(pages[i]), 0, PAGE_SIZE); - flush_dcache_page(pages[i]); + memzero_page(pages[i], 0, PAGE_SIZE); SetPageUptodate(pages[i]); } return ((loff_t)pcount) << PAGE_SHIFT; @@ -82,7 +81,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, return 0; } haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); - ll_rw_block(REQ_OP_READ, haveblocks, bhs); + bh_read_batch(haveblocks, bhs); curbh = 0; curpage = 0; @@ -120,7 +119,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, zerr != Z_STREAM_END) { if (!stream.avail_out) { if (pages[curpage]) { - stream.next_out = 
page_address(pages[curpage]) + stream.next_out = kmap_local_page(pages[curpage]) + poffset; stream.avail_out = PAGE_SIZE - poffset; poffset = 0; @@ -176,6 +175,10 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, flush_dcache_page(pages[curpage]); SetPageUptodate(pages[curpage]); } + if (stream.next_out != (unsigned char *)zisofs_sink_page) { + kunmap_local(stream.next_out); + stream.next_out = NULL; + } curpage++; } if (!stream.avail_in) @@ -183,6 +186,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, } inflate_out: zlib_inflateEnd(&stream); + if (stream.next_out && stream.next_out != (unsigned char *)zisofs_sink_page) + kunmap_local(stream.next_out); z_eio: mutex_unlock(&zisofs_zlib_lock); @@ -283,9 +288,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, } if (poffset && *pages) { - memset(page_address(*pages) + poffset, 0, - PAGE_SIZE - poffset); - flush_dcache_page(*pages); + memzero_page(*pages, poffset, PAGE_SIZE - poffset); SetPageUptodate(*pages); } return 0; @@ -343,10 +346,8 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) for (i = 0; i < pcount; i++, index++) { if (i != full_page) pages[i] = grab_cache_page_nowait(mapping, index); - if (pages[i]) { + if (pages[i]) ClearPageError(pages[i]); - kmap(pages[i]); - } } err = zisofs_fill_pages(inode, full_page, pcount, pages); @@ -357,7 +358,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) flush_dcache_page(pages[i]); if (i == full_page && err) SetPageError(pages[i]); - kunmap(pages[i]); unlock_page(pages[i]); if (i != full_page) put_page(pages[i]); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 88bf20303466..df9d70588b60 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1277,13 +1277,11 @@ static int isofs_read_level3_size(struct inode *inode) } while (more_entries); out: kfree(tmpde); - if (bh) - brelse(bh); + brelse(bh); return 0; out_nomem: - if (bh) 
- brelse(bh); + brelse(bh); return -ENOMEM; out_noread: @@ -1486,8 +1484,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) ret = 0; out: kfree(tmpde); - if (bh) - brelse(bh); + brelse(bh); return ret; out_badread: diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bc8270e0d7d0..2696f43e7239 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1898,19 +1898,16 @@ static int journal_get_superblock(journal_t *journal) { struct buffer_head *bh; journal_superblock_t *sb; - int err = -EIO; + int err; bh = journal->j_sb_buffer; J_ASSERT(bh != NULL); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - printk(KERN_ERR - "JBD2: IO error reading journal superblock\n"); - goto out; - } + err = bh_read(bh, 0); + if (err < 0) { + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); + goto out; } if (buffer_verified(bh)) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 3688d16fe83b..8286a9ec122f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start) if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; if (nbufs == MAXBUF) { - ll_rw_block(REQ_OP_READ, nbufs, bufs); + bh_readahead_batch(nbufs, bufs, 0); journal_brelse_array(bufs, nbufs); nbufs = 0; } @@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start) } if (nbufs) - ll_rw_block(REQ_OP_READ, nbufs, bufs); + bh_readahead_batch(nbufs, bufs, 0); err = 0; failed: @@ -152,9 +152,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal, return -ENOMEM; if (!buffer_uptodate(bh)) { - /* If this is a brand new buffer, start readahead. - Otherwise, we assume we are already reading it. */ - if (!buffer_req(bh)) + /* + * If this is a brand new buffer, start readahead. + * Otherwise, we assume we are already reading it. 
+ */ + bool need_readahead = !buffer_req(bh); + + bh_read_nowait(bh, 0); + if (need_readahead) do_readahead(journal, offset); wait_on_buffer(bh); } @@ -688,7 +693,6 @@ static int do_one_pass(journal_t *journal, mark_buffer_dirty(nbh); BUFFER_TRACE(nbh, "marking uptodate"); ++info->nr_replays; - /* ll_rw_block(WRITE, 1, &nbh); */ unlock_buffer(nbh); brelse(obh); brelse(nbh); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index c6821a509481..4061e0ba7010 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, { int i, ret; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; @@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); ops.mode = MTD_OPS_AUTO_OOB; @@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { int ret; - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); ops.mode = MTD_OPS_AUTO_OOB; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1cc88ba6de90..3990f3e270cb 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -472,6 +472,16 @@ static void kernfs_drain(struct kernfs_node *kn) lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); + /* + * Skip draining if already fully drained. This avoids draining and its + * lockdep annotations for nodes which have never been activated + * allowing embedding kernfs_remove() in create error paths without + * worrying about draining. 
+ */ + if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && + !kernfs_should_drain_open_files(kn)) + return; + up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { @@ -480,7 +490,6 @@ static void kernfs_drain(struct kernfs_node *kn) lock_contended(&kn->dep_map, _RET_IP_); } - /* but everyone should wait for draining */ wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); @@ -489,7 +498,8 @@ static void kernfs_drain(struct kernfs_node *kn) rwsem_release(&kn->dep_map, _RET_IP_); } - kernfs_drain_open_files(kn); + if (kernfs_should_drain_open_files(kn)) + kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } @@ -695,13 +705,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, goto err_unlock; } - /* - * ACTIVATED is protected with kernfs_mutex but it was clear when - * @kn was added to idr and we just wanna see it set. No need to - * grab kernfs_mutex. - */ - if (unlikely(!(kn->flags & KERNFS_ACTIVATED) || - !atomic_inc_not_zero(&kn->count))) + if (unlikely(!kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; spin_unlock(&kernfs_idr_lock); @@ -743,10 +747,7 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; ret = -ENOENT; - if (parent->flags & KERNFS_EMPTY_DIR) - goto out_unlock; - - if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) + if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); @@ -1304,6 +1305,21 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return pos->parent; } +static void kernfs_activate_one(struct kernfs_node *kn) +{ + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); + + kn->flags |= KERNFS_ACTIVATED; + + if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) + return; + + WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb)); + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + 
atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); +} + /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated @@ -1325,15 +1341,42 @@ void kernfs_activate(struct kernfs_node *kn) down_write(&root->kernfs_rwsem); pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (pos->flags & KERNFS_ACTIVATED) - continue; + while ((pos = kernfs_next_descendant_post(pos, kn))) + kernfs_activate_one(pos); - WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); - WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); + up_write(&root->kernfs_rwsem); +} - atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); - pos->flags |= KERNFS_ACTIVATED; +/** + * kernfs_show - show or hide a node + * @kn: kernfs_node to show or hide + * @show: whether to show or hide + * + * If @show is %false, @kn is marked hidden and deactivated. A hidden node is + * ignored in future activations. If %true, the mark is removed and activation + * state is restored. This function won't implicitly activate a new node in a + * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. + * + * To avoid recursion complexities, directories aren't supported for now.
+ */ +void kernfs_show(struct kernfs_node *kn, bool show) +{ + struct kernfs_root *root = kernfs_root(kn); + + if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) + return; + + down_write(&root->kernfs_rwsem); + + if (show) { + kn->flags &= ~KERNFS_HIDDEN; + if (kn->flags & KERNFS_ACTIVATED) + kernfs_activate_one(kn); + } else { + kn->flags |= KERNFS_HIDDEN; + if (kernfs_active(kn)) + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + kernfs_drain(kn); } up_write(&root->kernfs_rwsem); @@ -1358,34 +1401,27 @@ static void __kernfs_remove(struct kernfs_node *kn) pr_debug("kernfs %s: removing\n", kn->name); - /* prevent any new usage under @kn by deactivating all nodes */ + /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) + while ((pos = kernfs_next_descendant_post(pos, kn))) { + pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* - * kernfs_drain() drops kernfs_rwsem temporarily and @pos's + * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); - /* - * Drain iff @kn was activated. This avoids draining and - * its lockdep annotations for nodes which have never been - * activated and allows embedding kernfs_remove() in create - * error paths without worrying about draining. - */ - if (kn->flags & KERNFS_ACTIVATED) - kernfs_drain(pos); - else - WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + kernfs_drain(pos); /* * kernfs_unlink_sibling() succeeds once per node. 
Use it @@ -1585,8 +1621,11 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); - if (kn) + if (kn) { + kernfs_get(kn); __kernfs_remove(kn); + kernfs_put(kn); + } up_write(&root->kernfs_rwsem); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b3ec34386b43..9ab6c92e02da 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -23,6 +23,8 @@ struct kernfs_open_node { atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ + unsigned int nr_mmapped; + unsigned int nr_to_release; }; /* @@ -57,31 +59,17 @@ static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) } /** - * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn. - * - * @of: associated kernfs_open_file instance. - * @kn: target kernfs_node. - * - * Fetch and return ->attr.open of @kn if @of->list is non empty. - * If @of->list is not empty we can safely assume that @of is on - * @kn->attr.open->files list and this guarantees that @kn->attr.open - * will not vanish i.e. dereferencing outside RCU read-side critical - * section is safe here. - * - * The caller needs to make sure that @of->list is not empty. + * of_on - Return the kernfs_open_node of the specified kernfs_open_file + * @of: target kernfs_open_file */ -static struct kernfs_open_node * -kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn) +static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { - struct kernfs_open_node *on; - - on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list)); - - return on; + return rcu_dereference_protected(of->kn->attr.open, + !list_empty(&of->list)); } /** - * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn + * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node.
* @@ -96,7 +84,7 @@ kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn) * The caller needs to make sure that kernfs_open_file_mutex is held. */ static struct kernfs_open_node * -kernfs_deref_open_node_protected(struct kernfs_node *kn) +kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); @@ -207,12 +195,8 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v) static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; - struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn); - - if (!on) - return -EINVAL; - of->event = atomic_read(&on->event); + of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } @@ -235,7 +219,6 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; - struct kernfs_open_node *on; char *buf; buf = of->prealloc_buf; @@ -257,14 +240,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_free; } - on = kernfs_deref_open_node(of, of->kn); - if (!on) { - len = -EINVAL; - mutex_unlock(&of->mutex); - goto out_free; - } - - of->event = atomic_read(&on->event); + of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) @@ -553,6 +529,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) rc = 0; of->mmapped = true; + of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: @@ -580,31 +557,30 @@ out_unlock: static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { - struct kernfs_open_node *on, *new_on = NULL; - struct mutex *mutex = NULL; + struct kernfs_open_node *on; + struct mutex *mutex; mutex = 
kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); - if (on) { - list_add_tail(&of->list, &on->files); - mutex_unlock(mutex); - return 0; - } else { + if (!on) { /* not there, initialize a new one */ - new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); - if (!new_on) { + on = kzalloc(sizeof(*on), GFP_KERNEL); + if (!on) { mutex_unlock(mutex); return -ENOMEM; } - atomic_set(&new_on->event, 1); - init_waitqueue_head(&new_on->poll); - INIT_LIST_HEAD(&new_on->files); - list_add_tail(&of->list, &new_on->files); - rcu_assign_pointer(kn->attr.open, new_on); + atomic_set(&on->event, 1); + init_waitqueue_head(&on->poll); + INIT_LIST_HEAD(&on->files); + rcu_assign_pointer(kn->attr.open, on); } - mutex_unlock(mutex); + list_add_tail(&of->list, &on->files); + if (kn->flags & KERNFS_HAS_RELEASE) + on->nr_to_release++; + + mutex_unlock(mutex); return 0; } @@ -613,6 +589,7 @@ static int kernfs_get_open_node(struct kernfs_node *kn, * * @kn: target kernfs_node * @of: associated kernfs_open_file + * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free @@ -622,21 +599,30 @@ static int kernfs_get_open_node(struct kernfs_node *kn, * None. 
*/ static void kernfs_unlink_open_file(struct kernfs_node *kn, - struct kernfs_open_file *of) + struct kernfs_open_file *of, + bool open_failed) { struct kernfs_open_node *on; - struct mutex *mutex = NULL; + struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } - if (of) + if (of) { + if (kn->flags & KERNFS_HAS_RELEASE) { + WARN_ON_ONCE(of->released == open_failed); + if (open_failed) + on->nr_to_release--; + } + if (of->mmapped) + on->nr_mmapped--; list_del(&of->list); + } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); @@ -763,7 +749,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) return 0; err_put_node: - kernfs_unlink_open_file(kn, of); + kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: @@ -795,6 +781,7 @@ static void kernfs_release_file(struct kernfs_node *kn, */ kn->attr.ops->release(of); of->released = true; + of_on(of)->nr_to_release--; } } @@ -802,15 +789,16 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); - struct mutex *mutex = NULL; if (kn->flags & KERNFS_HAS_RELEASE) { + struct mutex *mutex; + mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } - kernfs_unlink_open_file(kn, of); + kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); @@ -818,28 +806,33 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) return 0; } -void kernfs_drain_open_files(struct kernfs_node *kn) +bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; - struct kernfs_open_file *of; - struct mutex *mutex = NULL; - - if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE))) - return; + bool ret; /* - * 
lockless opportunistic check is safe below because no one is adding to - * ->attr.open at this point of time. This check allows early bail out - * if ->attr.open is already NULL. kernfs_unlink_open_file makes - * ->attr.open NULL only while holding kernfs_open_file_mutex so below - * check under kernfs_open_file_mutex_ptr(kn) will ensure bailing out if - * ->attr.open became NULL while waiting for the mutex. + * @kn being deactivated guarantees that @kn->attr.open can't change + * beneath us making the lockless test below safe. */ - if (!rcu_access_pointer(kn->attr.open)) - return; + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + rcu_read_lock(); + on = rcu_dereference(kn->attr.open); + ret = on && (on->nr_mmapped || on->nr_to_release); + rcu_read_unlock(); + + return ret; +} + +void kernfs_drain_open_files(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; @@ -848,13 +841,17 @@ void kernfs_drain_open_files(struct kernfs_node *kn) list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); - if (kn->flags & KERNFS_HAS_MMAP) + if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); + of->mmapped = false; + on->nr_mmapped--; + } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } + WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } @@ -874,11 +871,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { - struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); - struct kernfs_open_node *on = kernfs_deref_open_node(of, kn); - - if (!on) - return EPOLLERR; + struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); diff --git 
a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 3ae214d02d44..fc5821effd97 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -157,6 +157,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, */ extern const struct file_operations kernfs_file_fops; +bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index c5a5c7b90d72..2a39ffb8423b 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -424,6 +424,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, NTLMSSP_NEGOTIATE_56); } + if (cflags & NTLMSSP_NEGOTIATE_SEAL && smb3_encryption_negotiated(conn)) + flags |= NTLMSSP_NEGOTIATE_SEAL; + if (cflags & NTLMSSP_NEGOTIATE_ALWAYS_SIGN) flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; @@ -984,13 +987,16 @@ out: return rc; } -static int ksmbd_get_encryption_key(struct ksmbd_conn *conn, __u64 ses_id, +static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, int enc, u8 *key) { struct ksmbd_session *sess; u8 *ses_enc_key; - sess = ksmbd_session_lookup_all(conn, ses_id); + if (enc) + sess = work->sess; + else + sess = ksmbd_session_lookup_all(work->conn, ses_id); if (!sess) return -EINVAL; @@ -1078,9 +1084,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, return sg; } -int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, +int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov, unsigned int nvec, int enc) { + struct ksmbd_conn *conn = work->conn; struct smb2_transform_hdr *tr_hdr = smb2_get_msg(iov[0].iov_base); unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; int rc; @@ -1094,7 +1101,7 @@ int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); struct ksmbd_crypto_ctx *ctx; - rc = ksmbd_get_encryption_key(conn, + rc = 
ksmbd_get_encryption_key(work, le64_to_cpu(tr_hdr->SessionId), enc, key); diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h index 25b772653de0..362b6159a6cf 100644 --- a/fs/ksmbd/auth.h +++ b/fs/ksmbd/auth.h @@ -33,9 +33,10 @@ struct ksmbd_session; struct ksmbd_conn; +struct ksmbd_work; struct kvec; -int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, +int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov, unsigned int nvec, int enc); void ksmbd_copy_gss_neg_header(void *buf); int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess, diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 756ad631c019..12be8386446a 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -60,6 +60,12 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->local_nls = load_nls("utf8"); if (!conn->local_nls) conn->local_nls = load_nls_default(); + if (IS_ENABLED(CONFIG_UNICODE)) + conn->um = utf8_load(UNICODE_AGE(12, 1, 0)); + else + conn->um = ERR_PTR(-EOPNOTSUPP); + if (IS_ERR(conn->um)) + conn->um = NULL; atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; @@ -350,6 +356,8 @@ out: wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0); + if (IS_ENABLED(CONFIG_UNICODE)) + utf8_unload(conn->um); unload_nls(conn->local_nls); if (default_conn_ops.terminate_fn) default_conn_ops.terminate_fn(conn); diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index e7f7d5707951..3643354a3fa7 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -14,6 +14,7 @@ #include <net/request_sock.h> #include <linux/kthread.h> #include <linux/nls.h> +#include <linux/unicode.h> #include "smb_common.h" #include "ksmbd_work.h" @@ -46,6 +47,7 @@ struct ksmbd_conn { char *request_buf; struct ksmbd_transport *transport; struct nls_table *local_nls; + struct unicode_map *um; struct list_head conns_list; /* smb session 1 per user */ struct xarray sessions; diff --git a/fs/ksmbd/ksmbd_netlink.h 
b/fs/ksmbd/ksmbd_netlink.h index e0cbcfa98c7e..ff07c67f4565 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -163,7 +163,8 @@ struct ksmbd_share_config_response { __u16 force_directory_mode; __u16 force_uid; __u16 force_gid; - __u32 reserved[128]; /* Reserved room */ + __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; + __u32 reserved[112]; /* Reserved room */ __u32 veto_list_sz; __s8 ____payload[]; }; diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c index c9bca1c2c834..328a412259dc 100644 --- a/fs/ksmbd/mgmt/share_config.c +++ b/fs/ksmbd/mgmt/share_config.c @@ -16,6 +16,7 @@ #include "user_config.h" #include "user_session.h" #include "../transport_ipc.h" +#include "../misc.h" #define SHARE_HASH_BITS 3 static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS); @@ -26,7 +27,7 @@ struct ksmbd_veto_pattern { struct list_head list; }; -static unsigned int share_name_hash(char *name) +static unsigned int share_name_hash(const char *name) { return jhash(name, strlen(name), 0); } @@ -72,7 +73,7 @@ __get_share_config(struct ksmbd_share_config *share) return share; } -static struct ksmbd_share_config *__share_lookup(char *name) +static struct ksmbd_share_config *__share_lookup(const char *name) { struct ksmbd_share_config *share; unsigned int key = share_name_hash(name); @@ -119,7 +120,8 @@ static int parse_veto_list(struct ksmbd_share_config *share, return 0; } -static struct ksmbd_share_config *share_config_request(char *name) +static struct ksmbd_share_config *share_config_request(struct unicode_map *um, + const char *name) { struct ksmbd_share_config_response *resp; struct ksmbd_share_config *share = NULL; @@ -133,6 +135,19 @@ static struct ksmbd_share_config *share_config_request(char *name) if (resp->flags == KSMBD_SHARE_FLAG_INVALID) goto out; + if (*resp->share_name) { + char *cf_resp_name; + bool equal; + + cf_resp_name = ksmbd_casefold_sharename(um, resp->share_name); + if (IS_ERR(cf_resp_name)) + goto out; + equal = 
!strcmp(cf_resp_name, name); + kfree(cf_resp_name); + if (!equal) + goto out; + } + share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL); if (!share) goto out; @@ -190,20 +205,11 @@ out: return share; } -static void strtolower(char *share_name) -{ - while (*share_name) { - *share_name = tolower(*share_name); - share_name++; - } -} - -struct ksmbd_share_config *ksmbd_share_config_get(char *name) +struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, + const char *name) { struct ksmbd_share_config *share; - strtolower(name); - down_read(&shares_table_lock); share = __share_lookup(name); if (share) @@ -212,7 +218,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(char *name) if (share) return share; - return share_config_request(name); + return share_config_request(um, name); } bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h index 902f2cb1963a..3fd338293942 100644 --- a/fs/ksmbd/mgmt/share_config.h +++ b/fs/ksmbd/mgmt/share_config.h @@ -9,6 +9,7 @@ #include <linux/workqueue.h> #include <linux/hashtable.h> #include <linux/path.h> +#include <linux/unicode.h> struct ksmbd_share_config { char *name; @@ -74,7 +75,8 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) __ksmbd_share_config_put(share); } -struct ksmbd_share_config *ksmbd_share_config_get(char *name); +struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, + const char *name); bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, const char *filename); #endif /* __SHARE_CONFIG_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c index 97ab7987df6e..8ce17b3fb8da 100644 --- a/fs/ksmbd/mgmt/tree_connect.c +++ b/fs/ksmbd/mgmt/tree_connect.c @@ -17,7 +17,7 @@ struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - char *share_name) + const char 
*share_name) { struct ksmbd_tree_conn_status status = {-ENOENT, NULL}; struct ksmbd_tree_connect_response *resp = NULL; @@ -26,7 +26,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct sockaddr *peer_addr; int ret; - sc = ksmbd_share_config_get(share_name); + sc = ksmbd_share_config_get(conn->um, share_name); if (!sc) return status; @@ -61,7 +61,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct ksmbd_share_config *new_sc; ksmbd_share_config_del(sc); - new_sc = ksmbd_share_config_get(share_name); + new_sc = ksmbd_share_config_get(conn->um, share_name); if (!new_sc) { pr_err("Failed to update stale share config\n"); status.ret = -ESTALE; diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h index 71e50271dccf..0f97ddc1e39c 100644 --- a/fs/ksmbd/mgmt/tree_connect.h +++ b/fs/ksmbd/mgmt/tree_connect.h @@ -42,7 +42,7 @@ struct ksmbd_session; struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - char *share_name); + const char *share_name); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn); diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index df991107ad2c..9e8afaa686e3 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/xattr.h> #include <linux/fs.h> +#include <linux/unicode.h> #include "misc.h" #include "smb_common.h" @@ -159,7 +160,7 @@ out: */ char *convert_to_nt_pathname(struct ksmbd_share_config *share, - struct path *path) + const struct path *path) { char *pathname, *ab_pathname, *nt_pathname; int share_path_len = share->path_sz; @@ -226,26 +227,53 @@ void ksmbd_conv_path_to_windows(char *path) strreplace(path, '/', '\\'); } +char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name) +{ + char *cf_name; + int cf_len; + + cf_name = kzalloc(KSMBD_REQ_MAX_SHARE_NAME, GFP_KERNEL); + if (!cf_name) + return 
ERR_PTR(-ENOMEM); + + if (IS_ENABLED(CONFIG_UNICODE) && um) { + const struct qstr q_name = {.name = name, .len = strlen(name)}; + + cf_len = utf8_casefold(um, &q_name, cf_name, + KSMBD_REQ_MAX_SHARE_NAME); + if (cf_len < 0) + goto out_ascii; + + return cf_name; + } + +out_ascii: + cf_len = strscpy(cf_name, name, KSMBD_REQ_MAX_SHARE_NAME); + if (cf_len < 0) { + kfree(cf_name); + return ERR_PTR(-E2BIG); + } + + for (; *cf_name; ++cf_name) + *cf_name = isascii(*cf_name) ? tolower(*cf_name) : *cf_name; + return cf_name - cf_len; +} + /** * ksmbd_extract_sharename() - get share name from tree connect request * @treename: buffer containing tree name and share name * * Return: share name on success, otherwise error */ -char *ksmbd_extract_sharename(char *treename) +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename) { - char *name = treename; - char *dst; - char *pos = strrchr(name, '\\'); + const char *name = treename, *pos = strrchr(name, '\\'); if (pos) name = (pos + 1); /* caller has to free the memory */ - dst = kstrdup(name, GFP_KERNEL); - if (!dst) - return ERR_PTR(-ENOMEM); - return dst; + return ksmbd_casefold_sharename(um, name); } /** diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h index aae2a252945f..1facfcd21200 100644 --- a/fs/ksmbd/misc.h +++ b/fs/ksmbd/misc.h @@ -15,12 +15,13 @@ int match_pattern(const char *str, size_t len, const char *pattern); int ksmbd_validate_filename(char *filename); int parse_stream_name(char *filename, char **stream_name, int *s_type); char *convert_to_nt_pathname(struct ksmbd_share_config *share, - struct path *path); + const struct path *path); int get_nlink(struct kstat *st); void ksmbd_conv_path_to_unix(char *path); void ksmbd_strip_last_slash(char *path); void ksmbd_conv_path_to_windows(char *path); -char *ksmbd_extract_sharename(char *treename); +char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name); +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename); char 
*convert_to_unix_name(struct ksmbd_share_config *share, const char *name); #define KSMBD_DIR_INFO_ALIGNMENT 8 diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 5052be9261d9..0ae8d08d85a8 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -345,6 +345,8 @@ int ndr_encode_posix_acl(struct ndr *n, { unsigned int ref_id = 0x00020000; int ret; + vfsuid_t vfsuid; + vfsgid_t vfsgid; n->offset = 0; n->length = 1024; @@ -372,10 +374,12 @@ int ndr_encode_posix_acl(struct ndr *n, if (ret) return ret; - ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode))); + vfsuid = i_uid_into_vfsuid(user_ns, inode); + ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid))); if (ret) return ret; - ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode))); + vfsgid = i_gid_into_vfsgid(user_ns, inode); + ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid))); if (ret) return ret; ret = ndr_write_int32(n, inode->i_mode); diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 9046cff4374b..d7d47b82451d 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1609,12 +1609,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) struct create_posix_rsp *buf; struct inode *inode = file_inode(fp->filp); struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); buf = (struct create_posix_rsp *)cc; memset(buf, 0, sizeof(struct create_posix_rsp)); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_posix_rsp, nlink)); - buf->ccontext.DataLength = cpu_to_le32(52); + /* + * DataLength = nlink(4) + reparse_tag(4) + mode(4) + + * domain sid(28) + unix group sid(16). 
+ */ + buf->ccontext.DataLength = cpu_to_le32(56); buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_posix_rsp, Name)); buf->ccontext.NameLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); @@ -1638,13 +1644,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) buf->nlink = cpu_to_le32(inode->i_nlink); buf->reparse_tag = cpu_to_le32(fp->volatile_id); - buf->mode = cpu_to_le32(inode->i_mode); - id_to_sid(from_kuid_munged(&init_user_ns, - i_uid_into_mnt(user_ns, inode)), - SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]); - id_to_sid(from_kgid_munged(&init_user_ns, - i_gid_into_mnt(user_ns, inode)), - SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]); + buf->mode = cpu_to_le32(inode->i_mode & 0777); + /* + * SidBuffer(44) contain two sids(Domain sid(28), UNIX group sid(16)). + * Domain sid(28) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 4(num_subauth)) + RID(4). + * UNIX group id(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). 
+ */ + id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)), + SIDOWNER, (struct smb_sid *)&buf->SidBuffer[0]); + id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)), + SIDUNIX_GROUP, (struct smb_sid *)&buf->SidBuffer[28]); } /* diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index ce42bff42ef9..a0d635304754 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -235,10 +235,8 @@ send: if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { rc = conn->ops->encrypt_resp(work); - if (rc < 0) { + if (rc < 0) conn->ops->set_rsp_status(work, STATUS_DATA_ERROR); - goto send; - } } ksmbd_conn_write(work); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index bfa6b41d895b..b2fc85d440d0 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -925,7 +925,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn, * * Return: true if connection should be encrypted, else false */ -static bool smb3_encryption_negotiated(struct ksmbd_conn *conn) +bool smb3_encryption_negotiated(struct ksmbd_conn *conn) { if (!conn->ops->generate_encryptionkey) return false; @@ -1883,7 +1883,7 @@ int smb2_tree_connect(struct ksmbd_work *work) goto out_err1; } - name = ksmbd_extract_sharename(treename); + name = ksmbd_extract_sharename(conn->um, treename); if (IS_ERR(name)) { status.ret = KSMBD_TREE_CONN_STATUS_ERROR; goto out_err1; @@ -2185,7 +2185,7 @@ out: * Return: 0 on success, otherwise error */ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, - struct path *path) + const struct path *path) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); char *attr_name = NULL, *value; @@ -2272,7 +2272,7 @@ next: return rc; } -static noinline int smb2_set_stream_name_xattr(struct path *path, +static noinline int smb2_set_stream_name_xattr(const struct path *path, struct ksmbd_file *fp, char *stream_name, int s_type) { @@ -2311,7 +2311,7 @@ static noinline int smb2_set_stream_name_xattr(struct path 
*path, return 0; } -static int smb2_remove_smb_xattrs(struct path *path) +static int smb2_remove_smb_xattrs(const struct path *path) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); char *name, *xattr_list = NULL; @@ -2345,7 +2345,7 @@ out: return err; } -static int smb2_create_truncate(struct path *path) +static int smb2_create_truncate(const struct path *path) { int rc = vfs_truncate(path, 0); @@ -2364,7 +2364,7 @@ static int smb2_create_truncate(struct path *path) return rc; } -static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path, +static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *path, struct ksmbd_file *fp) { struct xattr_dos_attrib da = {0}; @@ -2387,7 +2387,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path, } static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, - struct path *path, struct ksmbd_file *fp) + const struct path *path, struct ksmbd_file *fp) { struct xattr_dos_attrib da; int rc; @@ -2447,7 +2447,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, static int smb2_create_sd_buffer(struct ksmbd_work *work, struct smb2_create_req *req, - struct path *path) + const struct path *path) { struct create_context *context; struct create_sd_buf_req *sd_buf; @@ -2477,8 +2477,11 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr, struct user_namespace *mnt_userns, struct inode *inode) { - fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode); - fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + + fattr->cf_uid = vfsuid_into_kuid(vfsuid); + fattr->cf_gid = vfsgid_into_kgid(vfsgid); fattr->cf_mode = inode->i_mode; fattr->cf_acls = NULL; fattr->cf_dacls = NULL; @@ -2761,7 +2764,6 @@ int smb2_open(struct ksmbd_work *work) } else { file_present = true; user_ns = mnt_user_ns(path.mnt); - 
generic_fillattr(user_ns, d_inode(path.dentry), &stat); } if (stream_name) { if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { @@ -2770,7 +2772,8 @@ int smb2_open(struct ksmbd_work *work) rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; } } else { - if (S_ISDIR(stat.mode) && s_type == DATA_STREAM) { + if (file_present && S_ISDIR(d_inode(path.dentry)->i_mode) && + s_type == DATA_STREAM) { rc = -EIO; rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; } @@ -2787,7 +2790,8 @@ int smb2_open(struct ksmbd_work *work) } if (file_present && req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE && - S_ISDIR(stat.mode) && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { + S_ISDIR(d_inode(path.dentry)->i_mode) && + !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { ksmbd_debug(SMB, "open() argument is a directory: %s, %x\n", name, req->CreateOptions); rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; @@ -2797,7 +2801,7 @@ int smb2_open(struct ksmbd_work *work) if (file_present && (req->CreateOptions & FILE_DIRECTORY_FILE_LE) && !(req->CreateDisposition == FILE_CREATE_LE) && - !S_ISDIR(stat.mode)) { + !S_ISDIR(d_inode(path.dentry)->i_mode)) { rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; rc = -EIO; goto err_out; @@ -3561,17 +3565,22 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, posix_info->AllocationSize = cpu_to_le64(ksmbd_kstat->kstat->blocks << 9); posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev); posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink); - posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode); + posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777); posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino); posix_info->DosAttributes = S_ISDIR(ksmbd_kstat->kstat->mode) ? FILE_ATTRIBUTE_DIRECTORY_LE : FILE_ATTRIBUTE_ARCHIVE_LE; if (d_info->hide_dot_file && d_info->name[0] == '.') posix_info->DosAttributes |= FILE_ATTRIBUTE_HIDDEN_LE; + /* + * SidBuffer(32) contain two sids(Domain sid(16), UNIX group sid(16)). 
+ * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). + */ id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid), - SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]); + SIDUNIX_USER, (struct smb_sid *)&posix_info->SidBuffer[0]); id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid), - SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]); + SIDUNIX_GROUP, (struct smb_sid *)&posix_info->SidBuffer[16]); memcpy(posix_info->name, conv_name, conv_len); posix_info->name_len = cpu_to_le32(conv_len); posix_info->NextEntryOffset = cpu_to_le32(next_entry_offset); @@ -3806,11 +3815,6 @@ static bool __query_dir(struct dir_context *ctx, const char *name, int namlen, return true; } -static void restart_ctx(struct dir_context *ctx) -{ - ctx->pos = 0; -} - static int verify_info_level(int info_level) { switch (info_level) { @@ -3892,8 +3896,7 @@ int smb2_query_dir(struct ksmbd_work *work) inode_permission(file_mnt_user_ns(dir_fp->filp), file_inode(dir_fp->filp), MAY_READ | MAY_EXEC)) { - pr_err("no right to enumerate directory (%pd)\n", - dir_fp->filp->f_path.dentry); + pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp); rc = -EACCES; goto err_out2; } @@ -3919,7 +3922,6 @@ int smb2_query_dir(struct ksmbd_work *work) if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) { ksmbd_debug(SMB, "Restart directory scan\n"); generic_file_llseek(dir_fp->filp, 0, SEEK_SET); - restart_ctx(&dir_fp->readdir_data.ctx); } memset(&d_info, 0, sizeof(struct ksmbd_dir_info)); @@ -3966,11 +3968,9 @@ int smb2_query_dir(struct ksmbd_work *work) */ if (!d_info.out_buf_len && !d_info.num_entry) goto no_buf_len; - if (rc == 0) - restart_ctx(&dir_fp->readdir_data.ctx); - if (rc == -ENOSPC) + if (rc > 0 || rc == -ENOSPC) rc = 0; - if (rc) + else if (rc) goto err_out; d_info.wptr = d_info.rptr; @@ -4027,6 +4027,8 @@ err_out2: rsp->hdr.Status = STATUS_NO_MEMORY; else if (rc == -EFAULT) 
rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; + else if (rc == -EIO) + rsp->hdr.Status = STATUS_FILE_CORRUPT_ERROR; if (!rsp->hdr.Status) rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; @@ -4156,7 +4158,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, int rc, name_len, value_len, xattr_list_len, idx; ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0; struct smb2_ea_info_req *ea_req = NULL; - struct path *path; + const struct path *path; struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); if (!(fp->daccess & FILE_READ_EA_LE)) { @@ -4493,7 +4495,7 @@ static void get_file_stream_info(struct ksmbd_work *work, struct smb2_file_stream_info *file_info; char *stream_name, *xattr_list = NULL, *stream_buf; struct kstat stat; - struct path *path = &fp->filp->f_path; + const struct path *path = &fp->filp->f_path; ssize_t xattr_list_len; int nbytes = 0, streamlen, stream_name_len, next, idx = 0; int buf_free_len; @@ -4718,7 +4720,11 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, { struct smb311_posix_qinfo *file_info; struct inode *inode = file_inode(fp->filp); + struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); u64 time; + int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32; file_info = (struct smb311_posix_qinfo *)rsp->Buffer; file_info->CreationTime = cpu_to_le64(fp->create_time); @@ -4733,12 +4739,22 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->EndOfFile = cpu_to_le64(inode->i_size); file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9); file_info->HardLinks = cpu_to_le32(inode->i_nlink); - file_info->Mode = cpu_to_le32(inode->i_mode); + file_info->Mode = cpu_to_le32(inode->i_mode & 0777); file_info->DeviceId = cpu_to_le32(inode->i_rdev); - rsp->OutputBufferLength = - cpu_to_le32(sizeof(struct smb311_posix_qinfo)); - 
inc_rfc1001_len(rsp_org, sizeof(struct smb311_posix_qinfo)); - return 0; + + /* + * Sids(32) contain two sids(Domain sid(16), UNIX group sid(16)). + * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). + */ + id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)), + SIDUNIX_USER, (struct smb_sid *)&file_info->Sids[0]); + id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)), + SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]); + + rsp->OutputBufferLength = cpu_to_le32(out_buf_len); + inc_rfc1001_len(rsp_org, out_buf_len); + return out_buf_len; } static int smb2_get_info_file(struct ksmbd_work *work, @@ -4858,8 +4874,8 @@ static int smb2_get_info_file(struct ksmbd_work *work, pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); rc = -EOPNOTSUPP; } else { - rc = find_file_posix_info(rsp, fp, work->response_buf); - file_infoclass_size = sizeof(struct smb311_posix_qinfo); + file_infoclass_size = find_file_posix_info(rsp, fp, + work->response_buf); } break; default: @@ -5411,7 +5427,7 @@ static int smb2_rename(struct ksmbd_work *work, if (!pathname) return -ENOMEM; - abs_oldname = d_path(&fp->filp->f_path, pathname, PATH_MAX); + abs_oldname = file_path(fp->filp, pathname, PATH_MAX); if (IS_ERR(abs_oldname)) { rc = -EINVAL; goto out; @@ -5546,7 +5562,7 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "link name is %s\n", link_name); - target_name = d_path(&filp->f_path, pathname, PATH_MAX); + target_name = file_path(filp, pathname, PATH_MAX); if (IS_ERR(target_name)) { rc = -EINVAL; goto out; @@ -6264,8 +6280,8 @@ int smb2_read(struct ksmbd_work *work) goto out; } - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", - fp->filp->f_path.dentry, offset, length); + ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", + fp->filp, offset, length); work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!work->aux_payload_buf) { 
@@ -6529,8 +6545,8 @@ int smb2_write(struct ksmbd_work *work) data_buf = (char *)(((char *)&req->hdr.ProtocolId) + le16_to_cpu(req->DataOffset)); - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", - fp->filp->f_path.dentry, offset, length); + ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", + fp->filp, offset, length); err = ksmbd_vfs_write(work, fp, data_buf, length, &offset, writethrough, &nbytes); if (err < 0) @@ -7641,11 +7657,16 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } - if (in_buf_len < sizeof(struct validate_negotiate_info_req)) - return -EINVAL; + if (in_buf_len < offsetof(struct validate_negotiate_info_req, + Dialects)) { + ret = -EINVAL; + goto out; + } - if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) - return -EINVAL; + if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) { + ret = -EINVAL; + goto out; + } ret = fsctl_validate_negotiate_info(conn, (struct validate_negotiate_info_req *)&req->Buffer[0], @@ -8571,7 +8592,7 @@ int smb3_encrypt_resp(struct ksmbd_work *work) buf_size += iov[1].iov_len; work->resp_hdr_sz = iov[1].iov_len; - rc = ksmbd_crypt_message(work->conn, iov, rq_nvec, 1); + rc = ksmbd_crypt_message(work, iov, rq_nvec, 1); if (rc) return rc; @@ -8590,7 +8611,6 @@ bool smb3_is_transform_hdr(void *buf) int smb3_decrypt_req(struct ksmbd_work *work) { - struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess; char *buf = work->request_buf; unsigned int pdu_length = get_rfc1002_len(buf); @@ -8610,7 +8630,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) return -ECONNABORTED; } - sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId)); + sess = ksmbd_session_lookup_all(work->conn, le64_to_cpu(tr_hdr->SessionId)); if (!sess) { pr_err("invalid session id(%llx) in transform header\n", le64_to_cpu(tr_hdr->SessionId)); @@ -8621,7 +8641,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4; iov[1].iov_base = buf + 
sizeof(struct smb2_transform_hdr) + 4; iov[1].iov_len = buf_data_size; - rc = ksmbd_crypt_message(conn, iov, 2, 0); + rc = ksmbd_crypt_message(work, iov, 2, 0); if (rc) return rc; diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index af455278d005..092fdd3f8750 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -158,7 +158,8 @@ struct create_posix_rsp { __le32 nlink; __le32 reparse_tag; __le32 mode; - u8 SidBuffer[40]; + /* SidBuffer contain two sids(Domain sid(28), UNIX group sid(16)) */ + u8 SidBuffer[44]; } __packed; struct smb2_buffer_desc_v1 { @@ -439,7 +440,8 @@ struct smb2_posix_info { __le32 HardLinks; __le32 ReparseTag; __le32 Mode; - u8 SidBuffer[40]; + /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ + u8 SidBuffer[32]; __le32 name_len; u8 name[1]; /* @@ -492,6 +494,7 @@ int smb3_decrypt_req(struct ksmbd_work *work); int smb3_encrypt_resp(struct ksmbd_work *work); bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work); int smb2_set_rsp_credits(struct ksmbd_work *work); +bool smb3_encryption_negotiated(struct ksmbd_conn *conn); /* smb2 misc functions */ int ksmbd_smb2_check_message(struct ksmbd_work *work); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 7f8ab14fb8ec..d96da872d70a 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -4,6 +4,8 @@ * Copyright (C) 2018 Namjae Jeon <linkinjeon@kernel.org> */ +#include <linux/user_namespace.h> + #include "smb_common.h" #include "server.h" #include "misc.h" @@ -625,8 +627,8 @@ int ksmbd_override_fsids(struct ksmbd_work *work) if (!cred) return -ENOMEM; - cred->fsuid = make_kuid(current_user_ns(), uid); - cred->fsgid = make_kgid(current_user_ns(), gid); + cred->fsuid = make_kuid(&init_user_ns, uid); + cred->fsgid = make_kgid(&init_user_ns, gid); gi = groups_alloc(0); if (!gi) { diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 3781bca2c8fc..b05ff9b146b5 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -275,7 +275,8 @@ static 
int sid_to_id(struct user_namespace *user_ns, uid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id)); + uid = KUIDT_INIT(id); + uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid)); if (uid_valid(uid)) { fattr->cf_uid = uid; rc = 0; @@ -285,7 +286,8 @@ static int sid_to_id(struct user_namespace *user_ns, gid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id)); + gid = KGIDT_INIT(id); + gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid)); if (gid_valid(gid)) { fattr->cf_gid = gid; rc = 0; @@ -991,7 +993,7 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type, } int smb_inherit_dacl(struct ksmbd_conn *conn, - struct path *path, + const struct path *path, unsigned int uid, unsigned int gid) { const struct smb_sid *psid, *creator = NULL; @@ -1185,7 +1187,7 @@ bool smb_inherit_flags(int flags, bool is_dir) return false; } -int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); @@ -1352,7 +1354,7 @@ err_out: } int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, - struct path *path, struct smb_ntsd *pntsd, int ntsd_len, + const struct path *path, struct smb_ntsd *pntsd, int ntsd_len, bool type_check) { int rc; diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index fcb2c83f2992..618f2e0236b3 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -201,12 +201,12 @@ void posix_state_to_acl(struct posix_acl_state *state, struct posix_acl_entry *pace); int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid); bool smb_inherit_flags(int flags, bool is_dir); -int smb_inherit_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_inherit_dacl(struct ksmbd_conn 
*conn, const struct path *path, unsigned int uid, unsigned int gid); -int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid); int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, - struct path *path, struct smb_ntsd *pntsd, int ntsd_len, + const struct path *path, struct smb_ntsd *pntsd, int ntsd_len, bool type_check); void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); void ksmbd_init_domain(u32 *sub_auth); @@ -214,25 +214,25 @@ void ksmbd_init_domain(u32 *sub_auth); static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, struct posix_acl_entry *pace) { - kuid_t kuid; + vfsuid_t vfsuid; /* If this is an idmapped mount, apply the idmapping. */ - kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid); /* Translate the kuid into a userspace id ksmbd would see. */ - return from_kuid(&init_user_ns, kuid); + return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)); } static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, struct posix_acl_entry *pace) { - kgid_t kgid; + vfsgid_t vfsgid; /* If this is an idmapped mount, apply the idmapping. */ - kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid); /* Translate the kgid into a userspace id ksmbd would see. 
*/ - return from_kgid(&init_user_ns, kgid); + return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)); } #endif /* _SMBACL_H */ diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 35b55ee94fe5..096eda9ef873 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -32,7 +32,7 @@ /* SMB_DIRECT negotiation timeout in seconds */ #define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 -#define SMB_DIRECT_MAX_SEND_SGES 8 +#define SMB_DIRECT_MAX_SEND_SGES 6 #define SMB_DIRECT_MAX_RECV_SGES 1 /* @@ -62,13 +62,13 @@ static int smb_direct_receive_credit_max = 255; static int smb_direct_send_credit_target = 255; /* The maximum single message size can be sent to remote peer */ -static int smb_direct_max_send_size = 8192; +static int smb_direct_max_send_size = 1364; /* The maximum fragmented upper-layer payload receive size supported */ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ -static int smb_direct_max_receive_size = 8192; +static int smb_direct_max_receive_size = 1364; static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; @@ -1527,6 +1527,8 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, } case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: { + ib_drain_qp(t->qp); + t->status = SMB_DIRECT_CS_DISCONNECTED; wake_up_interruptible(&t->wait_status); wake_up_interruptible(&t->wait_reassembly_queue); diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 143bba4e4db8..63d55f543bd2 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -399,7 +399,8 @@ static int create_socket(struct interface *iface) ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); if (ret) { - pr_err("Can't create socket for ipv6, try ipv4: %d\n", ret); + if (ret != -EAFNOSUPPORT) + pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret); ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); 
if (ret) { diff --git a/fs/ksmbd/unicode.h b/fs/ksmbd/unicode.h index 5593024230ae..076f6034a789 100644 --- a/fs/ksmbd/unicode.h +++ b/fs/ksmbd/unicode.h @@ -24,6 +24,7 @@ #include <asm/byteorder.h> #include <linux/types.h> #include <linux/nls.h> +#include <linux/unicode.h> #define UNIUPR_NOLOWER /* Example to not expand lower case tables */ @@ -69,7 +70,7 @@ char *smb_strndup_from_utf16(const char *src, const int maxlen, const struct nls_table *codepage); int smbConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapchars); -char *ksmbd_extract_sharename(char *treename); +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename); #endif /* diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 48b2b901f6e5..8de970d6146f 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -377,8 +377,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, if (work->conn->connection_type) { if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) { - pr_err("no right to read(%pd)\n", - fp->filp->f_path.dentry); + pr_err("no right to read(%pD)\n", fp->filp); return -EACCES; } } @@ -487,8 +486,7 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, if (work->conn->connection_type) { if (!(fp->daccess & FILE_WRITE_DATA_LE)) { - pr_err("no right to write(%pd)\n", - fp->filp->f_path.dentry); + pr_err("no right to write(%pD)\n", fp->filp); err = -EACCES; goto out; } @@ -527,8 +525,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, if (sync) { err = vfs_fsync_range(filp, offset, offset + *written, 0); if (err < 0) - pr_err("fsync failed for filename = %pd, err = %d\n", - fp->filp->f_path.dentry, err); + pr_err("fsync failed for filename = %pD, err = %d\n", + fp->filp, err); } out: @@ -543,7 +541,7 @@ out: * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_getattr(struct path *path, struct kstat *stat) +int ksmbd_vfs_getattr(const struct path *path, 
struct kstat *stat) { int err; @@ -1145,12 +1143,23 @@ static bool __caseless_lookup(struct dir_context *ctx, const char *name, unsigned int d_type) { struct ksmbd_readdir_data *buf; + int cmp = -EINVAL; buf = container_of(ctx, struct ksmbd_readdir_data, ctx); if (buf->used != namlen) return true; - if (!strncasecmp((char *)buf->private, name, namlen)) { + if (IS_ENABLED(CONFIG_UNICODE) && buf->um) { + const struct qstr q_buf = {.name = buf->private, + .len = buf->used}; + const struct qstr q_name = {.name = name, + .len = namlen}; + + cmp = utf8_strncasecmp(buf->um, &q_buf, &q_name); + } + if (cmp < 0) + cmp = strncasecmp((char *)buf->private, name, namlen); + if (!cmp) { memcpy((char *)buf->private, name, namlen); buf->dirent_count = 1; return false; @@ -1166,7 +1175,8 @@ static bool __caseless_lookup(struct dir_context *ctx, const char *name, * * Return: 0 on success, otherwise error */ -static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) +static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, + size_t namelen, struct unicode_map *um) { int ret; struct file *dfilp; @@ -1176,6 +1186,7 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) .private = name, .used = namelen, .dirent_count = 0, + .um = um, }; dfilp = dentry_open(dir, flags, current_cred()); @@ -1238,7 +1249,8 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, break; err = ksmbd_vfs_lookup_in_dir(&parent, filename, - filename_len); + filename_len, + work->conn->um); path_put(&parent); if (err) goto out; @@ -1741,11 +1753,11 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, *total_size_written = 0; if (!(src_fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) { - pr_err("no right to read(%pd)\n", src_fp->filp->f_path.dentry); + pr_err("no right to read(%pD)\n", src_fp->filp); return -EACCES; } if (!(dst_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) { - pr_err("no right to write(%pd)\n", 
dst_fp->filp->f_path.dentry); + pr_err("no right to write(%pD)\n", dst_fp->filp); return -EACCES; } diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 70da4c0ba7ad..593059ca8511 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -12,6 +12,7 @@ #include <linux/namei.h> #include <uapi/linux/xattr.h> #include <linux/posix_acl.h> +#include <linux/unicode.h> #include "smbacl.h" #include "xattr.h" @@ -60,6 +61,7 @@ struct ksmbd_readdir_data { unsigned int used; unsigned int dirent_count; unsigned int file_attr; + struct unicode_map *um; }; /* ksmbd kstat wrapper to get valid create time when reading dir entry */ @@ -85,7 +87,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id); int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name); int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, const char *newname); -int ksmbd_vfs_getattr(struct path *path, struct kstat *stat); +int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat); int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname); int ksmbd_vfs_truncate(struct ksmbd_work *work, diff --git a/fs/libfs.c b/fs/libfs.c index 31b0ddf01c31..682d56345a1c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> +#include <linux/iversion.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> @@ -1520,3 +1521,48 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); + +/** + * inode_maybe_inc_iversion - increments i_version + * @inode: inode with the i_version that should be updated + * @force: increment the counter even if it's not necessary? + * + * Every time the inode is modified, the i_version field must be seen to have + * changed by any observer. 
+ * + * If "force" is set or the QUERIED flag is set, then ensure that we increment + * the value, and clear the queried flag. + * + * In the common case where neither is set, then we can return "false" without + * updating i_version. + * + * If this function returns false, and no other metadata has changed, then we + * can avoid logging the metadata. + */ +bool inode_maybe_inc_iversion(struct inode *inode, bool force) +{ + u64 cur, new; + + /* + * The i_version field is not strictly ordered with any other inode + * information, but the legacy inode_inc_iversion code used a spinlock + * to serialize increments. + * + * Here, we add full memory barriers to ensure that any de-facto + * ordering with other info is preserved. + * + * This barrier pairs with the barrier in inode_query_iversion() + */ + smp_mb(); + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is clear then we needn't do anything */ + if (!force && !(cur & I_VERSION_QUERIED)) + return false; + + /* Since lowest bit is flag, add 2 to avoid it */ + new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return true; +} +EXPORT_SYMBOL(inode_maybe_inc_iversion); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 937fa5fae2b8..8afdc408ca4f 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -53,16 +53,16 @@ static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, } static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { int error; struct inode *inode = minix_new_inode(dir, mode, &error); if (inode) { minix_set_inode(inode, 0); mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); } - return error; + return finish_open_simple(file, error); } static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, diff --git a/fs/namei.c b/fs/namei.c index 
8533087e5dac..578c2110df02 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3583,72 +3583,94 @@ static int do_open(struct nameidata *nd, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs init_user_ns. */ -struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, - struct dentry *dentry, umode_t mode, int open_flag) +static int vfs_tmpfile(struct user_namespace *mnt_userns, + const struct path *parentpath, + struct file *file, umode_t mode) { - struct dentry *child = NULL; - struct inode *dir = dentry->d_inode; + struct dentry *child; + struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; /* we want directory to be writable */ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) - goto out_err; - error = -EOPNOTSUPP; + return error; if (!dir->i_op->tmpfile) - goto out_err; - error = -ENOMEM; - child = d_alloc(dentry, &slash_name); + return -EOPNOTSUPP; + child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) - goto out_err; + return -ENOMEM; + file->f_path.mnt = parentpath->mnt; + file->f_path.dentry = child; mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); - error = dir->i_op->tmpfile(mnt_userns, dir, child, mode); + error = dir->i_op->tmpfile(mnt_userns, dir, file, mode); + dput(child); if (error) - goto out_err; - error = -ENOENT; - inode = child->d_inode; - if (unlikely(!inode)) - goto out_err; - if (!(open_flag & O_EXCL)) { + return error; + /* Don't check for other permissions, the inode was just created */ + error = may_open(mnt_userns, &file->f_path, 0, file->f_flags); + if (error) + return error; + inode = file_inode(file); + if (!(file->f_flags & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } ima_post_create_tmpfile(mnt_userns, inode); - return child; + return 0; +} -out_err: - dput(child); - return ERR_PTR(error); +/** + * vfs_tmpfile_open - open a tmpfile for kernel internal 
use + * @mnt_userns: user namespace of the mount the inode was found from + * @parentpath: path of the base directory + * @mode: mode of the new tmpfile + * @open_flag: flags + * @cred: credentials for open + * + * Create and open a temporary file. The file is not accounted in nr_files, + * hence this is only for kernel internal use, and must not be installed into + * file tables or such. + */ +struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, + const struct path *parentpath, + umode_t mode, int open_flag, const struct cred *cred) +{ + struct file *file; + int error; + + file = alloc_empty_file_noaccount(open_flag, cred); + if (!IS_ERR(file)) { + error = vfs_tmpfile(mnt_userns, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); + } + } + return file; } -EXPORT_SYMBOL(vfs_tmpfile); +EXPORT_SYMBOL(vfs_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct user_namespace *mnt_userns; - struct dentry *child; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); + if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; mnt_userns = mnt_user_ns(path.mnt); - child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag); - error = PTR_ERR(child); - if (IS_ERR(child)) + error = vfs_tmpfile(mnt_userns, &path, file, op->mode); + if (error) goto out2; - dput(path.dentry); - path.dentry = child; - audit_inode(nd->name, child, 0); - /* Don't check for other permissions, the inode was just created */ - error = may_open(mnt_userns, &path, 0, op->open_flag); - if (!error) - error = vfs_open(&path, file); + audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: @@ -5088,7 +5110,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, 
__GFP_FS); struct page *page; - void *fsdata; + void *fsdata = NULL; int err; unsigned int flags; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index da8da5cdbbc1..f50e025ae406 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -280,7 +280,7 @@ EXPORT_SYMBOL_GPL(nfs_put_client); static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; - const struct sockaddr *sap = data->addr; + const struct sockaddr *sap = (struct sockaddr *)data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); int error; @@ -666,7 +666,7 @@ static int nfs_init_server(struct nfs_server *server, struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = ctx->nfs_server.hostname, - .addr = (const struct sockaddr *)&ctx->nfs_server.address, + .addr = &ctx->nfs_server._address, .addrlen = ctx->nfs_server.addrlen, .nfs_mod = ctx->nfs_mod, .proto = ctx->nfs_server.protocol, diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5c97cad741a7..ead8a0e06abf 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -228,8 +228,7 @@ again: * */ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, - fmode_t type, - const nfs4_stateid *stateid, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit) { struct nfs_delegation *delegation; @@ -239,25 +238,24 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation != NULL) { spin_lock(&delegation->lock); - if (nfs4_is_valid_delegation(delegation, 0)) { - nfs4_stateid_copy(&delegation->stateid, stateid); - delegation->type = type; - delegation->pagemod_limit = pagemod_limit; - oldcred = delegation->cred; - delegation->cred = get_cred(cred); - clear_bit(NFS_DELEGATION_NEED_RECLAIM, - &delegation->flags); - spin_unlock(&delegation->lock); - rcu_read_unlock(); - put_cred(oldcred); - trace_nfs4_reclaim_delegation(inode, type); - 
return; - } - /* We appear to have raced with a delegation return. */ + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; + oldcred = delegation->cred; + delegation->cred = get_cred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + if (test_and_clear_bit(NFS_DELEGATION_REVOKED, + &delegation->flags)) + atomic_long_inc(&nfs_active_delegations); spin_unlock(&delegation->lock); + rcu_read_unlock(); + put_cred(oldcred); + trace_nfs4_reclaim_delegation(inode, type); + } else { + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, type, stateid, + pagemod_limit); } - rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 58036f657126..f594dac436a7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2489,9 +2489,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); goto out; } - if (dentry->d_fsdata) - /* old devname */ - kfree(dentry->d_fsdata); + /* old devname */ + kfree(dentry->d_fsdata); dentry->d_fsdata = NFS_FSDATA_BLOCKED; spin_unlock(&dentry->d_lock); diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index e87d500ad95a..6603b5cee029 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -16,8 +16,9 @@ #include "dns_resolve.h" ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, - struct sockaddr *sa, size_t salen) + struct sockaddr_storage *ss, size_t salen) { + struct sockaddr *sa = (struct sockaddr *)ss; ssize_t ret; char *ip_addr = NULL; int ip_len; @@ -341,7 +342,7 @@ out: } ssize_t nfs_dns_resolve_name(struct net *net, char *name, - size_t namelen, struct sockaddr *sa, size_t salen) + size_t namelen, struct sockaddr_storage *ss, size_t salen) { struct nfs_dns_ent key = { .hostname = name, @@ -354,7 +355,7 
@@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item); if (ret == 0) { if (salen >= item->addrlen) { - memcpy(sa, &item->addr, item->addrlen); + memcpy(ss, &item->addr, item->addrlen); ret = item->addrlen; } else ret = -EOVERFLOW; diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h index 576ff4b54c82..fe3b172c4de1 100644 --- a/fs/nfs/dns_resolve.h +++ b/fs/nfs/dns_resolve.h @@ -32,6 +32,6 @@ extern void nfs_dns_resolver_cache_destroy(struct net *net); #endif extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, - size_t namelen, struct sockaddr *sa, size_t salen); + size_t namelen, struct sockaddr_storage *sa, size_t salen); #endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e032fe201a36..d8ec889a4b3f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,7 +567,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) } wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); lock_page(page); mapping = page_file_mapping(page); @@ -655,9 +656,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } if (mntflags & NFS_MOUNT_WRITE_WAIT) { - result = filemap_fdatawait_range(file->f_mapping, - iocb->ki_pos - written, - iocb->ki_pos - 1); + filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } result = generic_write_sync(iocb, written); if (result < 0) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7d285561e59f..1ec79ccf89ad 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -30,14 +30,20 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) #define FF_LAYOUTRETURN_MAXERR 20 +enum nfs4_ff_op_type { + NFS4_FF_OP_LAYOUTSTATS, + NFS4_FF_OP_LAYOUTRETURN, +}; + static unsigned short io_maxretrans; static const struct 
pnfs_commit_ops ff_layout_commit_ops; static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); -static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, +static int +ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit); + int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, struct nfs4_ff_layout_mirror *mirror); @@ -1373,6 +1379,11 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, return -EIO; } + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_read_record_layoutstats_start(task, hdr); return 0; } @@ -1553,6 +1564,11 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1645,15 +1661,23 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, - struct nfs_commit_data *cdata) +static int ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) { + if (!pnfs_is_valid_lseg(cdata->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_commit_record_layoutstats_start(task, cdata); + return 0; } static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { - ff_layout_commit_prepare_common(task, data); + if (ff_layout_commit_prepare_common(task, data)) + return; + rpc_call_start(task); } @@ -1949,6 +1973,65 @@ ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, ff_layout_initiate_commit); } +static bool ff_layout_match_rw(const struct rpc_task *task, + const 
struct nfs_pgio_header *hdr, + const struct pnfs_layout_segment *lseg) +{ + return hdr->lseg == lseg; +} + +static bool ff_layout_match_commit(const struct rpc_task *task, + const struct nfs_commit_data *cdata, + const struct pnfs_layout_segment *lseg) +{ + return cdata->lseg == lseg; +} + +static bool ff_layout_match_io(const struct rpc_task *task, const void *data) +{ + const struct rpc_call_ops *ops = task->tk_ops; + + if (ops == &ff_layout_read_call_ops_v3 || + ops == &ff_layout_read_call_ops_v4 || + ops == &ff_layout_write_call_ops_v3 || + ops == &ff_layout_write_call_ops_v4) + return ff_layout_match_rw(task, task->tk_calldata, data); + if (ops == &ff_layout_commit_call_ops_v3 || + ops == &ff_layout_commit_call_ops_v4) + return ff_layout_match_commit(task, task->tk_calldata, data); + return false; +} + +static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds *mirror_ds; + struct nfs4_pnfs_ds *ds; + struct nfs_client *ds_clp; + struct rpc_clnt *clnt; + u32 idx; + + for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { + mirror = flseg->mirror_array[idx]; + mirror_ds = mirror->mirror_ds; + if (!mirror_ds) + continue; + ds = mirror->mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } +} + static struct pnfs_ds_commit_info * ff_layout_get_ds_info(struct inode *inode) { @@ -2161,8 +2244,9 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) FF_LAYOUTRETURN_MAXERR); spin_lock(&args->inode->i_lock); - ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo)); + ff_args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, 
&ff_args->devinfo[0], + ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN); spin_unlock(&args->inode->i_lock); args->ld_private->ops = &layoutreturn_ops; @@ -2396,7 +2480,7 @@ static const struct nfs4_xdr_opaque_ops layoutstat_ops = { static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit) + int dev_limit, enum nfs4_ff_op_type type) { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; @@ -2408,7 +2492,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, break; if (IS_ERR_OR_NULL(mirror->mirror_ds)) continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) continue; /* mirror refcount put in cleanup_layoutstats */ if (!refcount_inc_not_zero(&mirror->ref)) @@ -2448,7 +2534,9 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) spin_lock(&args->inode->i_lock); ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], dev_count); + &args->devinfo[0], + dev_count, + NFS4_FF_OP_LAYOUTSTATS); spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); @@ -2501,6 +2589,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .prepare_layoutreturn = ff_layout_prepare_layoutreturn, .sync = pnfs_nfs_generic_sync, .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cancel_io = ff_layout_cancel_io, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 4da701fd1424..09833ec102fc 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -273,9 +273,9 @@ static const struct constant_table nfs_secflavor_tokens[] = { * Address family must be initialized, and address must not be * the ANY address for that family. 
*/ -static int nfs_verify_server_address(struct sockaddr *addr) +static int nfs_verify_server_address(struct sockaddr_storage *addr) { - switch (addr->sa_family) { + switch (addr->ss_family) { case AF_INET: { struct sockaddr_in *sa = (struct sockaddr_in *)addr; return sa->sin_addr.s_addr != htonl(INADDR_ANY); @@ -969,7 +969,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_fh *mntfh = ctx->mntfh; - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; int ret; @@ -1044,7 +1044,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, memcpy(sap, &data->addr, sizeof(data->addr)); ctx->nfs_server.addrlen = sizeof(data->addr); ctx->nfs_server.port = ntohs(data->addr.sin_port); - if (sap->sa_family != AF_INET || + if (sap->ss_family != AF_INET || !nfs_verify_server_address(sap)) goto out_no_address; @@ -1200,7 +1200,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc, struct nfs4_mount_data *data) { struct nfs_fs_context *ctx = nfs_fc2context(fc); - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int ret; char *c; @@ -1314,7 +1314,7 @@ static int nfs_fs_context_validate(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_subversion *nfs_mod; - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int max_namelen = PAGE_SIZE; int max_pathlen = NFS_MAXPATHLEN; int port = 0; @@ -1540,7 +1540,7 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->version = nfss->nfs_client->rpc_ops->version; ctx->minorversion = nfss->nfs_client->cl_minorversion; - memcpy(&ctx->nfs_server.address, &nfss->nfs_client->cl_addr, + memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr, 
ctx->nfs_server.addrlen); if (fc->net_ns != net) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bea7c005119c..6b2cfa59a1a2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -static int nfs_wait_killable(int mode) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } - -int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) -{ - return nfs_wait_killable(mode); -} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); /** @@ -318,7 +313,7 @@ struct nfs_find_desc { static int nfs_find_actor(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; @@ -336,7 +331,7 @@ nfs_find_actor(struct inode *inode, void *opaque) static int nfs_init_locked(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); @@ -1332,7 +1327,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) */ for (;;) { ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; spin_lock(&inode->i_lock); @@ -2271,7 +2267,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) static void init_once(void *foo) { - struct nfs_inode *nfsi = (struct nfs_inode *) foo; + struct nfs_inode *nfsi = foo; inode_init_once(&nfsi->vfs_inode); INIT_LIST_HEAD(&nfsi->open_files); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 898dd95bc7a7..647fc3f547cb 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -69,7 +69,7 @@ static inline 
fmode_t flags_to_mode(int flags) struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ - const struct sockaddr *addr; /* Address of the server */ + const struct sockaddr_storage *addr; /* Address of the server */ const char *nodename; /* Hostname of the client */ const char *ip_addr; /* IP address of the client */ size_t addrlen; @@ -180,7 +180,7 @@ static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc) /* mount_clnt.c */ struct nfs_mount_request { - struct sockaddr *sap; + struct sockaddr_storage *sap; size_t salen; char *hostname; char *dirpath; @@ -223,7 +223,7 @@ extern void nfs4_server_set_init_caps(struct nfs_server *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen, + struct sockaddr_storage *sap, size_t salen, struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, @@ -235,7 +235,7 @@ extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, @@ -243,7 +243,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned 
int ds_timeo, unsigned int ds_retrans); #ifdef CONFIG_PROC_FS @@ -435,7 +435,6 @@ extern void nfs_zap_acl_cache(struct inode *inode); extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); -extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); /* super.c */ extern const struct super_operations nfs_sops; @@ -503,7 +502,6 @@ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); -extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, @@ -896,13 +894,13 @@ static inline bool nfs_error_is_fatal_on_server(int err) * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. 
*/ -static inline void nfs_set_port(struct sockaddr *sap, int *port, +static inline void nfs_set_port(struct sockaddr_storage *sap, int *port, const unsigned short default_port) { if (*port == NFS_UNSPEC_PORT) *port = default_port; - rpc_set_port(sap, *port); + rpc_set_port((struct sockaddr *)sap, *port); } struct nfs_direct_req { diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index c5e3b6b3366a..68e76b626371 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -158,7 +158,7 @@ int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans) struct rpc_create_args args = { .net = info->net, .protocol = info->protocol, - .address = info->sap, + .address = (struct sockaddr *)info->sap, .addrsize = info->salen, .timeout = &mnt_timeout, .servername = info->hostname, @@ -245,7 +245,7 @@ void nfs_umount(const struct nfs_mount_request *info) struct rpc_create_args args = { .net = info->net, .protocol = IPPROTO_UDP, - .address = info->sap, + .address = (struct sockaddr *)info->sap, .addrsize = info->salen, .timeout = &nfs_umnt_timeout, .servername = info->hostname, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 3295af4110f1..2f336ace7555 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -175,7 +175,7 @@ struct vfsmount *nfs_d_automount(struct path *path) } /* for submounts we want the same server; referrals will reassign */ - memcpy(&ctx->nfs_server.address, &client->cl_addr, client->cl_addrlen); + memcpy(&ctx->nfs_server._address, &client->cl_addr, client->cl_addrlen); ctx->nfs_server.addrlen = client->cl_addrlen; ctx->nfs_server.port = server->port; diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index b49359afac88..669cda757a5c 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -78,7 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, * the MDS. 
*/ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; @@ -98,7 +98,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, char buf[INET6_ADDRSTRLEN + 1]; /* fake a hostname because lockd wants it */ - if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0) return ERR_PTR(-EINVAL); cl_init.hostname = buf; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1597eef40d54..2e7579626cf0 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX) break; - freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index d37e4a5401b1..ecb428512fe1 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1093,6 +1093,9 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, &args.seq_args, &res.seq_res, 0); trace_nfs4_clone(src_inode, dst_inode, &args, status); if (status == 0) { + /* a zero-length count means clone to EOF in src */ + if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) + count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; nfs42_copy_dest_done(dst_inode, dst_offset, count); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } @@ -1175,6 +1178,7 @@ static int _nfs42_proc_removexattr(struct inode *inode, const char *name) ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + 
trace_nfs4_removexattr(inode, name, ret); if (!ret) nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); @@ -1214,6 +1218,7 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + trace_nfs4_setxattr(inode, name, ret); for (; np > 0; np--) put_page(pages[np - 1]); @@ -1246,6 +1251,7 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + trace_nfs4_getxattr(inode, name, ret); if (ret < 0) return ret; @@ -1317,6 +1323,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + trace_nfs4_listxattr(inode, ret); if (ret >= 0) { ret = res.copied; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index a9bf09fdf2c3..76ae11834206 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -981,7 +981,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) static void nfs4_xattr_cache_init_once(void *p) { - struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p; + struct nfs4_xattr_cache *cache = p; spin_lock_init(&cache->listxattr_lock); atomic_long_set(&cache->nent, 0); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b56f05113d36..fe1aeb0f048f 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -569,6 +569,14 @@ static int decode_listxattrs(struct xdr_stream *xdr, */ if (status == -ETOOSMALL) status = -ERANGE; + /* + * Special case: for LISTXATTRS, NFS4ERR_NOXATTR + * should be translated to success with zero-length reply. 
+ */ + if (status == -ENODATA) { + res->eof = true; + status = 0; + } goto out; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 79df6e83881b..cfef738d765e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -281,7 +281,7 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); -size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss, size_t salen, struct net *net, int port); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); @@ -459,7 +459,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs_client *); -extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 3c5678aec006..d3051b051a56 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -254,7 +254,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) goto error; ip_addr = (const char *)buf; } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strscpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); err = nfs_idmap_new(clp); if (err < 0) { @@ -346,6 +346,7 @@ int nfs40_init_client(struct nfs_client *clp) ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE, "NFSv4.0 transport Slot table"); if (ret) { + nfs4_shutdown_slot_table(tbl); kfree(tbl); return ret; } @@ -889,7 +890,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, */ static int 
nfs4_set_client(struct nfs_server *server, const char *hostname, - const struct sockaddr *addr, + const struct sockaddr_storage *addr, const size_t addrlen, const char *ip_addr, int proto, const struct rpc_timeout *timeparms, @@ -924,7 +925,7 @@ static int nfs4_set_client(struct nfs_server *server, __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); - server->port = rpc_get_port(addr); + server->port = rpc_get_port((struct sockaddr *)addr); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); @@ -960,7 +961,7 @@ static int nfs4_set_client(struct nfs_server *server, * the MDS. */ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version) { @@ -980,7 +981,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, }; char buf[INET6_ADDRSTRLEN + 1]; - if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0) return ERR_PTR(-EINVAL); cl_init.hostname = buf; @@ -1148,7 +1149,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) /* Get a client record */ error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, ctx->client_address, ctx->nfs_server.protocol, @@ -1238,7 +1239,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT); error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, parent_client->cl_ipaddr, XPRT_TRANSPORT_RDMA, @@ -1254,7 +1255,7 @@ struct nfs_server *nfs4_create_referral_server(struct 
fs_context *fc) rpc_set_port(&ctx->nfs_server.address, NFS_PORT); error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, parent_client->cl_ipaddr, XPRT_TRANSPORT_TCP, @@ -1303,14 +1304,14 @@ error: * Returns zero on success, or a negative errno value. */ int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen, struct net *net) + struct sockaddr_storage *sap, size_t salen, struct net *net) { struct nfs_client *clp = server->nfs_client; struct rpc_clnt *clnt = server->client; struct xprt_create xargs = { .ident = clp->cl_proto, .net = net, - .dstaddr = sap, + .dstaddr = (struct sockaddr *)sap, .addrlen = salen, .servername = hostname, }; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index ec6afd3c4bca..e3fdd2f45b01 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -583,7 +583,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; - struct idmap *idmap = (struct idmap *)aux; + struct idmap *idmap = aux; struct key *key = rka->target_key; int ret = -ENOKEY; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f2dbf904c598..9a98595bb160 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -164,16 +164,17 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss, size_t salen, struct net *net, int port) { + struct sockaddr *sa = (struct sockaddr *)ss; ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); if (ret == 0) { ret = rpc_uaddr2sockaddr(net, string, len, sa, salen); if (ret == 0) { - ret = nfs_dns_resolve_name(net, string, len, sa, salen); + ret = nfs_dns_resolve_name(net, string, 
len, ss, salen); if (ret < 0) ret = 0; } @@ -331,7 +332,7 @@ static int try_location(struct fs_context *fc, ctx->nfs_server.addrlen = nfs_parse_server_name(buf->data, buf->len, - &ctx->nfs_server.address, + &ctx->nfs_server._address, sizeof(ctx->nfs_server._address), fc->net_ns, 0); if (ctx->nfs_server.addrlen == 0) @@ -483,14 +484,13 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, char *page, char *page2, const struct nfs4_fs_location *location) { - const size_t addr_bufsize = sizeof(struct sockaddr_storage); struct net *net = rpc_net_ns(server->client); - struct sockaddr *sap; + struct sockaddr_storage *sap; unsigned int s; size_t salen; int error; - sap = kmalloc(addr_bufsize, GFP_KERNEL); + sap = kmalloc(sizeof(*sap), GFP_KERNEL); if (sap == NULL) return -ENOMEM; @@ -506,10 +506,10 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, continue; salen = nfs_parse_server_name(buf->data, buf->len, - sap, addr_bufsize, net, 0); + sap, sizeof(*sap), net, 0); if (salen == 0) continue; - rpc_set_port(sap, NFS_PORT); + rpc_set_port((struct sockaddr *)sap, NFS_PORT); error = -ENOMEM; hostname = kmemdup_nul(buf->data, buf->len, GFP_KERNEL); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3ed14a2a84a4..86ed5c0142c3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -416,8 +416,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); - freezable_schedule_timeout_killable_unsafe( - nfs4_update_delay(timeout)); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) return 0; return -EINTR; @@ -427,7 +427,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); - freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout)); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) return 0; return 
__fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS; @@ -3950,7 +3951,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, for (i = 0; i < location->nservers; i++) { struct nfs4_string *srv_loc = &location->servers[i]; - struct sockaddr addr; + struct sockaddr_storage addr; size_t addrlen; struct xprt_create xprt_args = { .ident = 0, @@ -3973,7 +3974,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, clp->cl_net, server->port); if (!addrlen) return; - xprt_args.dstaddr = &addr; + xprt_args.dstaddr = (struct sockaddr *)&addr; xprt_args.addrlen = addrlen; servername = kmalloc(srv_loc->len + 1, GFP_KERNEL); if (!servername) @@ -6607,7 +6608,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) struct nfs4_delegreturndata *d_data; struct pnfs_layout_hdr *lo; - d_data = (struct nfs4_delegreturndata *)data; + d_data = data; if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) { nfs4_sequence_done(task, &d_data->res.seq_res); @@ -7137,6 +7138,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; struct nfs4_lock_state *lsp = data->lsp; + struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry)); if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -7144,8 +7146,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; switch (task->tk_status) { case 0: - renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), - data->timestamp); + renew_lease(server, data->timestamp); if (data->arg.new_lock && !data->cancelled) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) @@ -7166,6 +7167,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) if (!nfs4_stateid_match(&data->arg.open_stateid, &lsp->ls_state->open_stateid)) goto out_restart; + else if (nfs4_async_handle_error(task, server, 
lsp->ls_state, NULL) == -EAGAIN) + goto out_restart; } else if (!nfs4_stateid_match(&data->arg.lock_stateid, &lsp->ls_stateid)) goto out_restart; @@ -7406,7 +7409,8 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; - freezable_schedule_timeout_interruptible(timeout); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(timeout); timeout *= 2; timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); status = -ERESTARTSYS; @@ -7474,10 +7478,8 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) break; status = -ERESTARTSYS; - freezer_do_not_count(); - wait_woken(&waiter.wait, TASK_INTERRUPTIBLE, + wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE, NFS4_LOCK_MAXTIMEOUT); - freezer_count(); } while (!signalled()); remove_wait_queue(q, &waiter.wait); @@ -8900,7 +8902,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred) void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + struct nfs4_add_xprt_data *adata = data; struct rpc_task *task; int status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9bab3e9c702a..a2d2d5d1b088 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -497,8 +497,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; - sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0, - gfp_flags); + sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags); if (sp->so_seqid.owner_id < 0) { kfree(sp); return NULL; @@ -534,7 +533,7 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); put_cred(sp->so_cred); - ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id); + 
ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id); kfree(sp); } @@ -877,8 +876,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; - lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, - 0, 0, GFP_KERNEL_ACCOUNT); + lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT); if (lsp->ls_seqid.owner_id < 0) goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); @@ -890,7 +888,7 @@ out_free: void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id); + ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id); nfs4_destroy_seqid_counter(&lsp->ls_seqid); kfree(lsp); } @@ -1314,7 +1312,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (res) goto out; if (clp->cl_cons_state < 0) @@ -1787,6 +1786,7 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) { + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); /* Mark all delegations for reclaim */ nfs_delegation_mark_reclaim(clp); nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); @@ -2671,6 +2671,7 @@ static void nfs4_state_manager(struct nfs_client *clp) if (status < 0) goto out_error; nfs4_state_end_reclaim_reboot(clp); + continue; } /* Detect expired delegations... 
*/ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 6ee6ad3674a2..2cff5901c689 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2097,6 +2097,7 @@ TRACE_EVENT(ff_layout_commit_error, ) ); +#ifdef CONFIG_NFS_V4_2 TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA); TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); @@ -2105,7 +2106,6 @@ TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); { NFS4_CONTENT_DATA, "DATA" }, \ { NFS4_CONTENT_HOLE, "HOLE" }) -#ifdef CONFIG_NFS_V4_2 TRACE_EVENT(nfs4_llseek, TP_PROTO( const struct inode *inode, @@ -2496,6 +2496,54 @@ TRACE_EVENT(nfs4_offload_cancel, __entry->stateid_seq, __entry->stateid_hash ) ); + +DECLARE_EVENT_CLASS(nfs4_xattr_event, + TP_PROTO( + const struct inode *inode, + const char *name, + int error + ), + + TP_ARGS(inode, name, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(name, name) + ), + + TP_fast_assign( + __entry->error = error < 0 ? -error : 0; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __assign_str(name, name); + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "name=%s", + -__entry->error, show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __get_str(name) + ) +); +#define DEFINE_NFS4_XATTR_EVENT(name) \ + DEFINE_EVENT(nfs4_xattr_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const char *name, \ + int error \ + ), \ + TP_ARGS(inode, name, error)) +DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr); + +DEFINE_NFS4_INODE_EVENT(nfs4_listxattr); #endif /* CONFIG_NFS_V4_2 */ #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index fa148308822c..620329b7e6ae 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -139,7 +139,7 
@@ static int __init nfs_root_setup(char *line) ROOT_DEV = Root_NFS; if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { - strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + strscpy(nfs_root_parms, line, sizeof(nfs_root_parms)); } else { size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; if (n >= sizeof(nfs_root_parms)) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2613b7e36eb9..a5db5158c634 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -710,6 +710,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -722,8 +723,10 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); - if (!mark_lseg_invalid(lseg, tmp_list)) - remaining++; + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; + pnfs_lseg_cancel_io(server, lseg); } dprintk("%s:Return %i\n", __func__, remaining); return remaining; @@ -1908,7 +1911,7 @@ static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) pnfs_layoutcommit_inode(lo->plh_inode, false); return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) @@ -2485,6 +2488,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -2507,6 +2511,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + pnfs_lseg_cancel_io(server, lseg); } if (remaining) { @@ -3192,7 +3197,7 
@@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (status) goto out; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f331f067691b..e3e6a41f19de 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -169,6 +169,8 @@ struct pnfs_layoutdriver_type { void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + + void (*cancel_io)(struct pnfs_layout_segment *lseg); }; struct pnfs_commit_ops { @@ -685,6 +687,13 @@ pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page req_offset(req), req_last); } +static inline void pnfs_lseg_cancel_io(struct nfs_server *server, + struct pnfs_layout_segment *lseg) +{ + if (server->pnfs_curr_ld->cancel_io) + server->pnfs_curr_ld->cancel_io(lseg); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 657c242a18ff..5d035dd2d7bf 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -374,12 +374,12 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, return NULL; } -/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest +/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request * for @page * @cinfo - commit info for current inode * @page - page to search for matching head request * - * Returns a the head request if one is found, otherwise returns NULL. + * Return: the head request if one is found, otherwise %NULL. 
*/ struct nfs_page * pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) @@ -821,7 +821,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) static struct nfs_client *(*get_v3_ds_connect)( struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, @@ -882,7 +882,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, continue; } clp = get_v3_ds_connect(mds_srv, - (struct sockaddr *)&da->da_addr, + &da->da_addr, da->da_addrlen, da->da_transport, timeo, retrans); if (IS_ERR(clp)) @@ -951,7 +951,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, put_cred(xprtdata.cred); } else { clp = nfs4_set_ds_client(mds_srv, - (struct sockaddr *)&da->da_addr, + &da->da_addr, da->da_addrlen, da->da_transport, timeo, retrans, minor_version); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ee66ffdb985e..05ae23657527 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -822,8 +822,7 @@ static int nfs_request_mount(struct fs_context *fc, { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_mount_request request = { - .sap = (struct sockaddr *) - &ctx->mount_server.address, + .sap = &ctx->mount_server._address, .dirpath = ctx->nfs_server.export_path, .protocol = ctx->mount_server.protocol, .fh = root_fh, @@ -854,7 +853,7 @@ static int nfs_request_mount(struct fs_context *fc, * Construct the mount server's address. 
*/ if (ctx->mount_server.address.sa_family == AF_UNSPEC) { - memcpy(request.sap, &ctx->nfs_server.address, + memcpy(request.sap, &ctx->nfs_server._address, ctx->nfs_server.addrlen); ctx->mount_server.addrlen = ctx->nfs_server.addrlen; } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index d5c57360b418..adc4e87a71d2 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -405,22 +405,15 @@ nfsd_file_unhash(struct nfsd_file *nf) return false; } -/* - * Return true if the file was unhashed. - */ -static bool +static void nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose) { trace_nfsd_file_unhash_and_dispose(nf); - if (!nfsd_file_unhash(nf)) - return false; - /* keep final reference for nfsd_file_lru_dispose */ - if (refcount_dec_not_one(&nf->nf_ref)) - return true; - - nfsd_file_lru_remove(nf); - list_add(&nf->nf_lru, dispose); - return true; + if (nfsd_file_unhash(nf)) { + /* caller must call nfsd_file_dispose_list() later */ + nfsd_file_lru_remove(nf); + list_add(&nf->nf_lru, dispose); + } } static void @@ -562,8 +555,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) * @lock: LRU list lock (unused) * @arg: dispose list * - * Note this can deadlock with nfsd_file_cache_purge. - * * Return values: * %LRU_REMOVED: @item was removed from the LRU * %LRU_ROTATE: @item is to be moved to the LRU tail @@ -748,8 +739,6 @@ nfsd_file_close_inode(struct inode *inode) * * Walk the LRU list and close any entries that have not been used since * the last scan. - * - * Note this can deadlock with nfsd_file_cache_purge. */ static void nfsd_file_delayed_close(struct work_struct *work) @@ -891,16 +880,12 @@ out_err: goto out; } -/* - * Note this can deadlock with nfsd_file_lru_cb. 
- */ static void __nfsd_file_cache_purge(struct net *net) { struct rhashtable_iter iter; struct nfsd_file *nf; LIST_HEAD(dispose); - bool del; rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); do { @@ -908,16 +893,8 @@ __nfsd_file_cache_purge(struct net *net) nf = rhashtable_walk_next(&iter); while (!IS_ERR_OR_NULL(nf)) { - if (net && nf->nf_net != net) - continue; - del = nfsd_file_unhash_and_dispose(nf, &dispose); - - /* - * Deadlock detected! Something marked this entry as - * unhased, but hasn't removed it from the hash list. - */ - WARN_ON_ONCE(!del); - + if (!net || nf->nf_net == net) + nfsd_file_unhash_and_dispose(nf, &dispose); nf = rhashtable_walk_next(&iter); } @@ -1064,9 +1041,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, .need = may_flags & NFSD_FILE_MAY_MASK, .net = SVC_NET(rqstp), }; - struct nfsd_file *nf, *new; - bool retry = true; + bool open_retry = true; + struct nfsd_file *nf; __be32 status; + int ret; status = fh_verify(rqstp, fhp, S_IFREG, may_flags|NFSD_MAY_OWNER_OVERRIDE); @@ -1076,35 +1054,33 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, key.cred = get_current_cred(); retry: - /* Avoid allocation if the item is already in cache */ - nf = rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params); + rcu_read_lock(); + nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, + nfsd_file_rhash_params); if (nf) nf = nfsd_file_get(nf); + rcu_read_unlock(); if (nf) goto wait_for_construction; - new = nfsd_file_alloc(&key, may_flags); - if (!new) { + nf = nfsd_file_alloc(&key, may_flags); + if (!nf) { status = nfserr_jukebox; goto out_status; } - nf = rhashtable_lookup_get_insert_key(&nfsd_file_rhash_tbl, - &key, &new->nf_rhash, - nfsd_file_rhash_params); - if (!nf) { - nf = new; - goto open_file; - } - if (IS_ERR(nf)) - goto insert_err; - nf = nfsd_file_get(nf); - if (nf == NULL) { - nf = new; + ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, + &key, &nf->nf_rhash, + 
nfsd_file_rhash_params); + if (likely(ret == 0)) goto open_file; - } - nfsd_file_slab_free(&new->nf_rcu); + + nfsd_file_slab_free(&nf->nf_rcu); + if (ret == -EEXIST) + goto retry; + trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); + status = nfserr_jukebox; + goto out_status; wait_for_construction: wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); @@ -1112,11 +1088,11 @@ wait_for_construction: /* Did construction of this file fail? */ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); - if (!retry) { + if (!open_retry) { status = nfserr_jukebox; goto out; } - retry = false; + open_retry = false; nfsd_file_put_noref(nf); goto retry; } @@ -1164,13 +1140,6 @@ open_file: smp_mb__after_atomic(); wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); goto out; - -insert_err: - nfsd_file_slab_free(&new->nf_rcu); - trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, PTR_ERR(nf)); - nf = NULL; - status = nfserr_jukebox; - goto out_status; } /** diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 198d7abf34e4..4e718500a00c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4375,8 +4375,8 @@ nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->track_reclaim_completes = false; - nn->clverifier_counter = prandom_u32(); - nn->clientid_base = prandom_u32(); + nn->clverifier_counter = get_random_u32(); + nn->clientid_base = get_random_u32(); nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6a29bcfc9390..dc74a947a440 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1458,12 +1458,14 @@ static __net_init int nfsd_init_net(struct net *net) goto out_drc_error; retval = nfsd_reply_cache_init(nn); if (retval) - goto out_drc_error; + goto out_cache_error; get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); 
seqlock_init(&nn->writeverf_lock); return 0; +out_cache_error: + nfsd4_leases_net_shutdown(nn); out_drc_error: nfsd_idmap_shutdown(net); out_idmap_error: diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index d73434200df9..8c52b6c9d31a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -392,8 +392,8 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) skip_pseudoflavor_check: /* Finally, check access permissions. */ error = nfsd_permission(rqstp, exp, dentry, access); - trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); out: + trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) nfsd_stats_fh_stale_inc(exp); return error; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 9f4d9432d38a..b9d15c3df3cc 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1668,8 +1668,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? 
nilfs_btree_node_get_key(node, nchildren - 2) : 0; - if (bh != NULL) - brelse(bh); + brelse(bh); return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } @@ -1717,8 +1716,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *btree, ptrs[i] = le64_to_cpu(dptrs[i]); } - if (bh != NULL) - brelse(bh); + brelse(bh); return nitems; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 67f63cfeade5..232dd7b6cca1 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -328,6 +328,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) struct inode *inode; struct nilfs_inode_info *ii; struct nilfs_root *root; + struct buffer_head *bh; int err = -ENOMEM; ino_t ino; @@ -343,11 +344,25 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) ii->i_state = BIT(NILFS_I_NEW); ii->i_root = root; - err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); if (unlikely(err)) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + if (unlikely(ino < NILFS_USER_INO)) { + nilfs_warn(sb, + "inode bitmap is inconsistent for reserved inodes"); + do { + brelse(bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + } while (ino < NILFS_USER_INO); + + nilfs_info(sb, "repaired inode bitmap for reserved inodes"); + } + ii->i_bh = bh; + atomic64_inc(&root->inodes_count); inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; @@ -440,6 +455,8 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) + return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) return -ESTALE; /* this inode is 
deleted */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3267e96c256c..39b7eea2642a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -480,41 +480,36 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff) { - unsigned int i; + unsigned int i, nr_folios; pgoff_t index; - unsigned int nblocks_in_page; unsigned long length = 0; - sector_t b; - struct pagevec pvec; - struct page *page; + struct folio_batch fbatch; + struct folio *folio; if (inode->i_mapping->nrpages == 0) return 0; index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); - nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, - pvec.pages); - if (pvec.nr == 0) + nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX, + &fbatch); + if (nr_folios == 0) return length; - if (length > 0 && pvec.pages[0]->index > index) - goto out; - - b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits); i = 0; do { - page = pvec.pages[i]; + folio = fbatch.folios[i]; - lock_page(page); - if (page_has_buffers(page)) { + folio_lock(folio); + if (folio_buffers(folio)) { struct buffer_head *bh, *head; + sector_t b; - bh = head = page_buffers(page); + b = folio->index << (PAGE_SHIFT - inode->i_blkbits); + bh = head = folio_buffers(folio); do { if (b < start_blk) continue; @@ -529,21 +524,17 @@ repeat: } else { if (length > 0) goto out_locked; - - b += nblocks_in_page; } - unlock_page(page); + folio_unlock(folio); - } while (++i < pagevec_count(&pvec)); + } while (++i < nr_folios); - index = page->index + 1; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; out_locked: - unlock_page(page); -out: - pagevec_release(&pvec); + folio_unlock(folio); + folio_batch_release(&fbatch); return length; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 
0afe0832c754..b4cebad21b48 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -875,9 +875,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - } else - WARN_ON(err == -EINVAL || err == -ENOENT); - + } else if (err == -EINVAL || err == -ENOENT) { + nilfs_error(sci->sc_super, + "checkpoint creation failed due to metadata corruption."); + err = -EIO; + } return err; } @@ -891,7 +893,11 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, &raw_cp, &bh_cp); if (unlikely(err)) { - WARN_ON(err == -EINVAL || err == -ENOENT); + if (err == -EINVAL || err == -ENOENT) { + nilfs_error(sci->sc_super, + "checkpoint finalization failed due to metadata corruption."); + err = -EIO; + } goto failed_ibh; } raw_cp->cp_snapshot_list.ssl_next = 0; @@ -2235,7 +2241,6 @@ int nilfs_construct_segment(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; - int err; if (!sci) return -EROFS; @@ -2243,8 +2248,7 @@ int nilfs_construct_segment(struct super_block *sb) /* A call inside transactions causes a deadlock. 
*/ BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); - err = nilfs_segctor_sync(sci); - return err; + return nilfs_segctor_sync(sci); } /** @@ -2786,10 +2790,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL); err = nilfs_segctor_start_thread(nilfs->ns_writer); - if (err) { - kfree(nilfs->ns_writer); - nilfs->ns_writer = NULL; - } + if (unlikely(err)) + nilfs_detach_log_writer(sb); + return err; } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index bf6d4d38afa0..57f51a9a3015 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -452,12 +452,6 @@ static inline bool fanotify_is_error_event(u32 mask) return mask & FAN_FS_ERROR; } -static inline bool fanotify_event_has_path(struct fanotify_event *event) -{ - return event->type == FANOTIFY_EVENT_TYPE_PATH || - event->type == FANOTIFY_EVENT_TYPE_PATH_PERM; -} - static inline const struct path *fanotify_event_path(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_PATH) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 87d8a50ee803..fde74eb333cc 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -76,10 +76,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); -/* allocate and destroy and event holder to attach events to notification/access queues */ -extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); -extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); - extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 52615e6090e1..a3865bc4a0c6 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -594,17 +594,37 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, for (;; a 
= (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { u8 *mrec_end = (u8 *)ctx->mrec + le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || - name_end > mrec_end) + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) + break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) break; + ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || a->type == AT_END)) return -ENOENT; if (unlikely(!a->length)) break; + + /* check whether ATTR_RECORD's length wrap */ + if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) + break; + /* check whether ATTR_RECORD's length is within bounds */ + if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) + break; + if (a->type != type) continue; /* diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index db0f1995aedd..08c659332e26 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi) goto err_out; } + /* Sanity check offset to the first attribute */ + if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { + ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", + le16_to_cpu(m->attrs_offset)); + goto err_out; + } + /* Need this to sanity check attribute list references to $MFT. 
*/ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 5d44ceac855b..e92bbd754365 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -560,7 +560,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd) buf = (ulong *)bh->b_data; - used = __bitmap_weight(buf, wbits); + used = bitmap_weight(buf, wbits); if (used < wbits) { frb = wbits - used; wnd->free_bits[iw] = frb; @@ -1364,7 +1364,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) buf = (ulong *)bh->b_data; __bitmap_clear(buf, b0, blocksize * 8 - b0); - frb = wbits - __bitmap_weight(buf, wbits); + frb = wbits - bitmap_weight(buf, wbits); wnd->total_zeroes += frb - wnd->free_bits[iw]; wnd->free_bits[iw] = frb; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index e7c494005122..0d611a6c5511 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3819,7 +3819,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) } log_init_pg_hdr(log, page_size, page_size, 1, 1); - log_create(log, l_size, 0, get_random_int(), false, false); + log_create(log, l_size, 0, get_random_u32(), false, false); log->ra = ra; @@ -3893,7 +3893,7 @@ check_restart_area: /* Do some checks based on whether we have a valid log page. 
*/ if (!rst_info.valid_page) { - open_log_count = get_random_int(); + open_log_count = get_random_u32(); goto init_log_instance; } open_log_count = le32_to_cpu(ra2->open_log_count); @@ -4044,7 +4044,7 @@ find_oldest: memcpy(ra->clients, Add2Ptr(ra2, t16), le16_to_cpu(ra2->ra_len) - t16); - log->current_openlog_count = get_random_int(); + log->current_openlog_count = get_random_u32(); ra->open_log_count = cpu_to_le32(log->current_openlog_count); log->ra_size = offsetof(struct RESTART_AREA, clients) + sizeof(struct CLIENT_REC); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 26a76ebfe58f..d5a3afbbbfd8 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -630,12 +630,9 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); set_bh_page(bh, page, off); - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; + err = bh_read(bh, 0); + if (err < 0) goto out; - } zero_user_segment(page, off + voff, off + block_size); } } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index af4157f61927..1d65f6ef00ca 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -636,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); *wait_bh++=bh; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 961d1cf54388..05f32989bad6 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -232,6 +232,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, handle_t *handle = NULL; struct ocfs2_super *osb; struct ocfs2_dinode *dirfe; + struct ocfs2_dinode *fe = NULL; struct buffer_head *new_fe_bh = NULL; struct inode *inode = NULL; struct ocfs2_alloc_context *inode_ac = NULL; @@ -382,6 +383,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, goto leave; } + fe = (struct 
ocfs2_dinode *) new_fe_bh->b_data; if (S_ISDIR(mode)) { status = ocfs2_fill_new_dir(osb, handle, dir, inode, new_fe_bh, data_ac, meta_ac); @@ -454,8 +456,11 @@ roll_back: leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && fe) + ocfs2_set_links_count(fe, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -632,18 +637,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } - status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { - u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); - int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, - inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); - if (tmp) - mlog_errno(tmp); - } - - return status; } static int ocfs2_mkdir(struct user_namespace *mnt_userns, @@ -2028,8 +2024,11 @@ bail: ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && fe) + ocfs2_set_links_count(fe, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 638d875eccc7..7aebdbf5cc0a 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -527,7 +527,7 @@ struct ocfs2_extent_block * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { -/*00*/ __le16 sm_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. @@ -548,7 +548,7 @@ struct ocfs2_extended_slot { * i_size. 
*/ struct ocfs2_slot_map_extended { -/*00*/ struct ocfs2_extended_slot se_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) @@ -727,7 +727,7 @@ struct ocfs2_dinode { struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; - __u8 i_symlink[0]; + DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; @@ -892,7 +892,7 @@ struct ocfs2_group_desc /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { - __u8 bg_bitmap[0]; + DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1358981e80a3..623db358b1ef 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2614,7 +2614,7 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, } /* - * Calculate out the start and number of virtual clusters we need to to CoW. + * Calculate out the start and number of virtual clusters we need to CoW. * * cpos is vitual start cluster position we want to do CoW in a * file and write_len is the cluster length. 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index dd77b7aaabf5..317126261523 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -334,10 +334,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); + strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) - strlcpy(new_conn->cc_cluster_name, cluster_name, + strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 5805a03d100b..9c74eace3adc 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -106,7 +106,7 @@ int ocfs2_claim_clusters(handle_t *handle, u32 *cluster_start, u32 *num_clusters); /* - * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * Use this variant of ocfs2_claim_clusters to specify a maximum * number of clusters smaller than the allocation reserved. 
*/ int __ocfs2_claim_clusters(handle_t *handle, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e2cc9eec287c..42c993e53924 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1764,9 +1764,7 @@ static int ocfs2_get_sector(struct super_block *sb, if (!buffer_dirty(*bh)) clear_buffer_uptodate(*bh); unlock_buffer(*bh); - ll_rw_block(REQ_OP_READ, 1, bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { + if (bh_read(*bh, 0) < 0) { mlog_errno(-EIO); brelse(*bh); *bh = NULL; @@ -2221,7 +2219,7 @@ static int ocfs2_initialize_super(struct super_block *sb, goto out_journal; } - strlcpy(osb->vol_label, di->id2.i_super.s_label, + strscpy(osb->vol_label, di->id2.i_super.s_label, OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index e2c2699d8016..9cacce5d55c1 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -398,7 +398,7 @@ static int orangefs_dir_release(struct inode *inode, struct file *file) const struct file_operations orangefs_dir_operations = { .llseek = orangefs_dir_llseek, .read = generic_read_dir, - .iterate = orangefs_dir_iterate, + .iterate_shared = orangefs_dir_iterate, .open = orangefs_dir_open, .release = orangefs_dir_release }; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3ffea291c410..f436d8847f08 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -193,11 +193,11 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old, return ovl_real_fileattr_set(new, &newfa); } -static int ovl_copy_up_data(struct ovl_fs *ofs, const struct path *old, - const struct path *new, loff_t len) +static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, + struct file *new_file, loff_t len) { + struct path datapath; struct file *old_file; - struct file *new_file; loff_t old_pos = 0; loff_t new_pos = 0; loff_t cloned; @@ -206,23 
+206,18 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, const struct path *old, bool skip_hole = false; int error = 0; - if (len == 0) - return 0; + ovl_path_lowerdata(dentry, &datapath); + if (WARN_ON(datapath.dentry == NULL)) + return -EIO; - old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY); + old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); if (IS_ERR(old_file)) return PTR_ERR(old_file); - new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY); - if (IS_ERR(new_file)) { - error = PTR_ERR(new_file); - goto out_fput; - } - /* Try to use clone_file_range to clone up within the same fs */ cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); if (cloned == len) - goto out; + goto out_fput; /* Couldn't clone, so now we try to copy the data */ /* Check if lower fs supports seek operation */ @@ -282,10 +277,8 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, const struct path *old, len -= bytes; } -out: if (!error && ovl_should_sync(ofs)) error = vfs_fsync(new_file, 0); - fput(new_file); out_fput: fput(old_file); return error; @@ -556,30 +549,31 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) return err; } -static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) +static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); - struct inode *inode = d_inode(c->dentry); - struct path upperpath, datapath; + struct file *new_file; int err; - ovl_path_upper(c->dentry, &upperpath); - if (WARN_ON(upperpath.dentry != NULL)) - return -EIO; + if (!S_ISREG(c->stat.mode) || c->metacopy || !c->stat.size) + return 0; - upperpath.dentry = temp; + new_file = ovl_path_open(temp, O_LARGEFILE | O_WRONLY); + if (IS_ERR(new_file)) + return PTR_ERR(new_file); - /* - * Copy up data first and then xattrs. Writing data after - * xattrs will remove security.capability xattr automatically. 
- */ - if (S_ISREG(c->stat.mode) && !c->metacopy) { - ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(ofs, &datapath, &upperpath, - c->stat.size); - if (err) - return err; - } + err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size); + fput(new_file); + + return err; +} + +static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) +{ + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *inode = d_inode(c->dentry); + struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = temp }; + int err; err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp); if (err) @@ -662,6 +656,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); + struct path path = { .mnt = ovl_upper_mnt(ofs) }; struct dentry *temp, *upper; struct ovl_cu_creds cc; int err; @@ -688,7 +683,16 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (IS_ERR(temp)) goto unlock; - err = ovl_copy_up_inode(c, temp); + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. 
+ */ + path.dentry = temp; + err = ovl_copy_up_data(c, &path); + if (err) + goto cleanup; + + err = ovl_copy_up_metadata(c, temp); if (err) goto cleanup; @@ -732,6 +736,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(c->destdir); struct dentry *temp, *upper; + struct file *tmpfile; struct ovl_cu_creds cc; int err; @@ -739,15 +744,22 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) return err; - temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); ovl_revert_cu_creds(&cc); - if (IS_ERR(temp)) - return PTR_ERR(temp); + if (IS_ERR(tmpfile)) + return PTR_ERR(tmpfile); - err = ovl_copy_up_inode(c, temp); + temp = tmpfile->f_path.dentry; + if (!c->metacopy && c->stat.size) { + err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size); + if (err) + return err; + } + + err = ovl_copy_up_metadata(c, temp); if (err) - goto out_dput; + goto out_fput; inode_lock_nested(udir, I_MUTEX_PARENT); @@ -761,16 +773,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) inode_unlock(udir); if (err) - goto out_dput; + goto out_fput; if (!c->metacopy) ovl_set_upperdata(d_inode(c->dentry)); - ovl_inode_update(d_inode(c->dentry), temp); + ovl_inode_update(d_inode(c->dentry), dget(temp)); - return 0; - -out_dput: - dput(temp); +out_fput: + fput(tmpfile); return err; } @@ -899,7 +909,7 @@ static ssize_t ovl_getxattr_value(const struct path *path, char *name, char **va static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); - struct path upperpath, datapath; + struct path upperpath; int err; char *capability = NULL; ssize_t cap_size; @@ -908,10 +918,6 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (WARN_ON(upperpath.dentry == NULL)) return -EIO; - ovl_path_lowerdata(c->dentry, &datapath); - if (WARN_ON(datapath.dentry == NULL)) - 
return -EIO; - if (c->stat.size) { err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS, &capability); @@ -919,7 +925,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) goto out; } - err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size); + err = ovl_copy_up_data(c, &upperpath); if (err) goto out_free; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 59624521eeb2..eee8f08d32b6 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -310,14 +310,16 @@ static inline int ovl_do_whiteout(struct ovl_fs *ofs, return err; } -static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs, - struct dentry *dentry, umode_t mode) +static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, + struct dentry *dentry, umode_t mode) { - struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0); - int err = PTR_ERR_OR_ZERO(ret); + struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; + struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode, + O_LARGEFILE | O_WRONLY, current_cred()); + int err = PTR_ERR_OR_ZERO(file); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); - return ret; + return file; } static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 9ca98bea8e18..a29a8afe9b26 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include <linux/exportfs.h> +#include <linux/file.h> #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); @@ -1369,7 +1370,8 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, const struct path *workpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); - struct dentry *temp, *workdir; + struct dentry *workdir; + struct file *tmpfile; bool rename_whiteout; bool d_type; int fh_type; @@ -1405,10 +1407,10 @@ static 
int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); - ofs->tmpfile = !IS_ERR(temp); + tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(tmpfile); if (ofs->tmpfile) - dput(temp); + fput(tmpfile); else pr_warn("upper fs does not support tmpfile.\n"); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index b4f109875e79..74dc0f571dc9 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -24,6 +24,7 @@ #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/mnt_idmapping.h> +#include <linux/iversion.h> static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -1227,6 +1228,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, } inode->i_ctime = current_time(inode); + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index c930001056f9..32b1116ae137 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR config PROC_CHILDREN bool "Include /proc/<pid>/task/<tid>/children file" + depends on PROC_FS default n help Provides a fast way to retrieve first level children pids of a task. See diff --git a/fs/proc/array.c b/fs/proc/array.c index 99fcbfda8e25..49283b8103c7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -279,7 +279,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? 
*/ - qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); + qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); diff --git a/fs/proc/base.c b/fs/proc/base.c index 2d9429bf51fa..9e479d7d202b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2350,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + struct vma_iterator vmi; genradix_init(&fa); @@ -2388,7 +2389,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) * routine might require mmap_lock taken in might_fault(). */ - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + pos = 2; + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) @@ -3196,6 +3199,19 @@ static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace * return 0; } +static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + mmput(mm); + } + + return 0; +} #endif /* CONFIG_KSM */ #ifdef CONFIG_STACKLEAK_METRICS @@ -3331,6 +3347,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; @@ -3668,6 +3685,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 837971e74109..fe7bfcb7d049 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -4,6 +4,7 @@ #include <linux/proc_fs.h> #include 
<linux/seq_file.h> #include <linux/blkdev.h> +#include "internal.h" static int devinfo_show(struct seq_file *f, void *v) { @@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = { static int __init proc_devices_init(void) { - proc_create_seq("devices", 0, NULL, &devinfo_ops); + struct proc_dir_entry *pde; + + pde = proc_create_seq("devices", 0, NULL, &devinfo_ops); + pde_make_permanent(pde); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d..b701d0207edf 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +static inline void pde_make_permanent(struct proc_dir_entry *pde) +{ + pde->flags |= PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -285,7 +290,7 @@ struct proc_maps_private { struct task_struct *task; struct mm_struct *mm; #ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; + struct vma_iterator iter; #endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 592e6dc7c110..2fc92a13f9f8 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -17,8 +17,6 @@ #include <asm/io.h> -extern wait_queue_head_t log_wait; - static int kmsg_open(struct inode * inode, struct file * file) { return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index f32878d9a39f..817981e57223 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/seqlock.h> #include <linux/time.h> +#include "internal.h" static int loadavg_proc_show(struct seq_file *m, void *v) { @@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v) static int __init proc_loadavg_init(void) { - proc_create_single("loadavg", 0, NULL, loadavg_proc_show); 
+ struct proc_dir_entry *pde; + + pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6e89f0e2fd20..5101131e6047 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -115,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); + show_val_kb(m, "SecPageTables: ", + global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", @@ -162,7 +164,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) static int __init proc_meminfo_init(void) { - proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index a2873a617ae8..f2273b164535 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -91,6 +91,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecount_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; @@ -268,6 +269,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, } static const struct proc_ops kpageflags_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpageflags_read, }; @@ -322,6 +324,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecgroup_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecgroup_read, }; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 021e83fe831f..48f2d60bd78a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -28,13 +28,6 @@ static 
const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; -/* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; -EXPORT_SYMBOL(sysctl_vals); - -const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; -EXPORT_SYMBOL_GPL(sysctl_long_vals); - /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { @@ -1246,7 +1239,7 @@ static bool get_links(struct ctl_dir *dir, static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; - struct ctl_dir *core_parent = NULL; + struct ctl_dir *core_parent; struct ctl_table_header *links; int err; diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 12901dcf57e2..f4616083faef 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -3,6 +3,7 @@ #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" /* * /proc/softirqs ... 
display the number of softirqs @@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v) static int __init proc_softirqs_init(void) { - proc_create_single("softirqs", 0, NULL, show_softirqs); + struct proc_dir_entry *pde; + + pde = proc_create_single("softirqs", 0, NULL, show_softirqs); + pde_make_permanent(pde); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4e0023643f8b..8a74cdcc9af0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> -#include <linux/vmacache.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> @@ -124,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -2UL; + vma = get_gate_vma(priv->mm); + } + + return vma; +} + static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; /* See m_next(). Zero at the start or after lseek. 
*/ if (last_addr == -1UL) @@ -153,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-EINTR); } + vma_iter_init(&priv->iter, mm, last_addr); hold_task_mempolicy(priv); - priv->tail_vma = get_gate_vma(mm); - - vma = find_vma(mm, last_addr); - if (vma) - return vma; + if (last_addr == -2UL) + return get_gate_vma(mm); - return priv->tail_vma; + return proc_get_vma(priv, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *next, *vma = v; - - if (vma == priv->tail_vma) - next = NULL; - else if (vma->vm_next) - next = vma->vm_next; - else - next = priv->tail_vma; - - *ppos = next ? next->vm_start : -1UL; - - return next; + if (*ppos == -2UL) { + *ppos = -1UL; + return NULL; + } + return proc_get_vma(m->private, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -864,7 +867,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true, false)); + hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -877,16 +880,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss; - struct mm_struct *mm; + struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; - unsigned long last_vma_end = 0; + unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; - mm = priv->mm; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; @@ -899,8 +902,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); + vma = mas_find(&mas, ULONG_MAX); + + if (unlikely(!vma)) + goto empty_set; - for (vma = priv->mm->mmap; vma;) { + 
vma_start = vma->vm_start; + do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; @@ -909,6 +917,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { + mas_pause(&mas); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -952,7 +961,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = find_vma(mm, last_vma_end - 1); + vma = mas_find(&mas, ULONG_MAX); /* Case 3 above */ if (!vma) break; @@ -966,11 +975,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) smap_gather_stats(vma, &mss, last_vma_end); } /* Case 2 above */ - vma = vma->vm_next; - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); - show_vma_header_prefix(m, priv->mm->mmap->vm_start, - last_vma_end, 0, 0, 0, 0); +empty_set: + show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); @@ -1263,6 +1271,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + MA_STATE(mas, &mm->mm_mt, 0, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1282,7 +1291,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; @@ -1294,8 +1303,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, - &cp); + walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); @@ -1418,9 +1426,19 @@ 
static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); - if (pm->show_pfn) + if (pm->show_pfn) { + pgoff_t offset; + /* + * For PFN swap offsets, keeping the offset field + * to be PFN only to be compatible with old smaps. + */ + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry); + else + offset = swp_offset(entry); frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + (offset << MAX_SWAPFILES_SHIFT); + } flags |= PM_SWAP; migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) @@ -1477,7 +1495,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned long offset; if (pm->show_pfn) { - offset = swp_offset(entry) + + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry); + else + offset = swp_offset(entry); + offset = offset + ((addr & ~PMD_MASK) >> PAGE_SHIFT); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a6d21fc0033c..2fd06f52b6a4 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -20,15 +20,13 @@ */ void task_mem(struct seq_file *m, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0, size; - - mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + mmap_read_lock(mm); + for_each_vma(vmi, vma) { bytes += kobjsize(vma); region = vma->vm_region; @@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - struct rb_node *p; unsigned long vsize = 0; mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + 
for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - } mmap_read_unlock(mm); return vsize; } @@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long size = kobjsize(mm); mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + for_each_vma(vmi, vma) { size += kobjsize(vma); region = vma->vm_region; if (region) { @@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) */ static int show_map(struct seq_file *m, void *_p) { - struct rb_node *p = _p; - - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); + return nommu_vma_show(m, _p); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; struct mm_struct *mm; - struct rb_node *p; - loff_t n = *pos; + struct vm_area_struct *vma; + unsigned long addr = *pos; + + /* See m_next(). Zero at the start or after lseek. */ + if (addr == -1UL) + return NULL; /* pin the task and mm whilst we play with them */ priv->task = get_proc_task(priv->inode); @@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EINTR); } - /* start from the Nth VMA */ - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) - if (n-- == 0) - return p; + /* start the next element from addr */ + vma = find_vma(mm, addr); + if (vma) + return vma; mmap_read_unlock(mm); mmput(mm); @@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml) static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct rb_node *p = _p; + struct vm_area_struct *vma = _p; - (*pos)++; - return p ? 
rb_next(p) : NULL; + *pos = vma->vm_end; + return find_vma(vma->vm_mm, vma->vm_end); } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index deb99bc9b7e6..b5343d209381 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/kernel_stat.h> +#include "internal.h" static int uptime_proc_show(struct seq_file *m, void *v) { @@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v) static int __init proc_uptime_init(void) { - proc_create_single("uptime", 0, NULL, uptime_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("uptime", 0, NULL, uptime_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index b449f186577f..02e3c3cd4a9a 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -5,6 +5,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include "internal.h" static int version_proc_show(struct seq_file *m, void *v) { @@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v) static int __init proc_version_init(void) { - proc_create_single("version", 0, NULL, version_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("version", 0, NULL, version_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_version_init); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b9895afca9d1..85b2fa3b211c 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -470,10 +470,8 @@ out2: out1: iput(sbi->inodes); out: - if (bh1) - brelse(bh1); - if (bh2) - brelse(bh2); + brelse(bh1); + brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 5f2405994280..0f1493e0f6d0 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -71,6 +71,40 @@ static ssize_t write_blk(struct 
qtree_mem_dqinfo *info, uint blk, char *buf) return ret; } +static inline int do_check_range(struct super_block *sb, const char *val_name, + uint val, uint min_val, uint max_val) +{ + if (val < min_val || val > max_val) { + quota_error(sb, "Getting %s %u out of range %u-%u", + val_name, val, min_val, max_val); + return -EUCLEAN; + } + + return 0; +} + +static int check_dquot_block_header(struct qtree_mem_dqinfo *info, + struct qt_disk_dqdbheader *dh) +{ + int err = 0; + + err = do_check_range(info->dqi_sb, "dqdh_next_free", + le32_to_cpu(dh->dqdh_next_free), 0, + info->dqi_blocks - 1); + if (err) + return err; + err = do_check_range(info->dqi_sb, "dqdh_prev_free", + le32_to_cpu(dh->dqdh_prev_free), 0, + info->dqi_blocks - 1); + if (err) + return err; + err = do_check_range(info->dqi_sb, "dqdh_entries", + le16_to_cpu(dh->dqdh_entries), 0, + qtree_dqstr_in_blk(info)); + + return err; +} + /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { @@ -85,6 +119,9 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info) ret = read_blk(info, blk, buf); if (ret < 0) goto out_buf; + ret = check_dquot_block_header(info, dh); + if (ret) + goto out_buf; info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); } else { @@ -232,6 +269,9 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, *err = read_blk(info, blk, buf); if (*err < 0) goto out_buf; + *err = check_dquot_block_header(info, dh); + if (*err) + goto out_buf; } else { blk = get_free_dqblk(info); if ((int)blk < 0) { @@ -313,6 +353,10 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } ref = (__le32 *)buf; newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + ret = do_check_range(dquot->dq_sb, "block", newblk, 0, + info->dqi_blocks - 1); + if (ret) + goto out_buf; if (!newblk) newson = 1; if (depth == info->dqi_qtree_depth - 1) { @@ -424,6 +468,9 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot 
*dquot, goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; + ret = check_dquot_block_header(info, dh); + if (ret) + goto out_buf; le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ ret = remove_free_dqentry(info, buf, blk); @@ -480,12 +527,10 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); - if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { - quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", - newblk, info->dqi_blocks); - ret = -EUCLEAN; + ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF, + info->dqi_blocks - 1); + if (ret) goto out_buf; - } if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); @@ -586,12 +631,10 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; - if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { - quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", - blk, info->dqi_blocks); - ret = -EUCLEAN; + ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF, + info->dqi_blocks - 1); + if (ret) goto out_buf; - } if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); @@ -705,15 +748,21 @@ static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, goto out_buf; } for (i = __get_index(info, *id, depth); i < epb; i++) { - if (ref[i] == cpu_to_le32(0)) { + uint blk_no = le32_to_cpu(ref[i]); + + if (blk_no == 0) { *id += level_inc; continue; } + ret = do_check_range(info->dqi_sb, "block", blk_no, 0, + info->dqi_blocks - 1); + if (ret) + goto out_buf; if (depth == info->dqi_qtree_depth - 1) { ret = 0; goto out_buf; } - ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1); + ret = find_next_id(info, id, blk_no, depth + 1); if (ret != -ENOENT) break; } diff --git 
a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ba3525ccc27e..cb240eac5036 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -203,9 +203,9 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long maxpages, lpages, nr, loop, ret; + unsigned long maxpages, lpages, nr_folios, loop, ret, nr_pages, pfn; struct inode *inode = file_inode(file); - struct page **pages = NULL, **ptr, *page; + struct folio_batch fbatch; loff_t isize; /* the mapping mustn't extend beyond the EOF */ @@ -221,31 +221,39 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, goto out; /* gang-find the pages */ - pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto out_free; - - nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); - if (nr != lpages) - goto out_free_pages; /* leave if some pages were missing */ + folio_batch_init(&fbatch); + nr_pages = 0; +repeat: + nr_folios = filemap_get_folios_contig(inode->i_mapping, &pgoff, + ULONG_MAX, &fbatch); + if (!nr_folios) { + ret = -ENOSYS; + return ret; + } + if (ret == -ENOSYS) { + ret = (unsigned long) folio_address(fbatch.folios[0]); + pfn = folio_pfn(fbatch.folios[0]); + } /* check the pages for physical adjacency */ - ptr = pages; - page = *ptr++; - page++; - for (loop = lpages; loop > 1; loop--) - if (*ptr++ != page++) - goto out_free_pages; + for (loop = 0; loop < nr_folios; loop++) { + if (pfn + nr_pages != folio_pfn(fbatch.folios[loop])) { + ret = -ENOSYS; + goto out_free; /* leave if not physical adjacent */ + } + nr_pages += folio_nr_pages(fbatch.folios[loop]); + if (nr_pages >= lpages) + goto out_free; /* successfully found desired pages*/ + } + if (nr_pages < lpages) { + folio_batch_release(&fbatch); + goto repeat; /* loop if pages are missing */ + } /* okay - all conditions fulfilled */ - ret = (unsigned long) page_address(pages[0]); 
-out_free_pages: - ptr = pages; - for (loop = nr; loop > 0; loop--) - put_page(*ptr++); out_free: - kfree(pages); + folio_batch_release(&fbatch); out: return ret; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index bc66d0173e33..b3257e852820 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -146,15 +146,15 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, } static int ramfs_tmpfile(struct user_namespace *mnt_userns, - struct inode *dir, struct dentry *dentry, umode_t mode) + struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); if (!inode) return -ENOSPC; - d_tmpfile(dentry, inode); - return 0; + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } static const struct inode_operations ramfs_dir_inode_operations = { diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 94addfcefede..9f62da7471c9 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -868,7 +868,7 @@ loop_next: */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); - ll_rw_block(REQ_OP_WRITE, 1, &bh); + write_dirty_buffer(bh, 0); spin_lock(lock); } put_bh(bh); @@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s, if (tbh) { if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_WRITE, 1, &tbh); + write_dirty_buffer(tbh, 0); reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; @@ -2240,7 +2240,7 @@ abort_replay: } } /* read in the log blocks, memcpy to the corresponding real block */ - ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks); + bh_read_batch(get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); @@ -2342,10 +2342,11 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } else bhlist[j++] = bh; } - ll_rw_block(REQ_OP_READ, j, bhlist); + bh = bhlist[0]; + 
bh_read_nowait(bh, 0); + bh_readahead_batch(j - 1, &bhlist[1], 0); for (i = 1; i < j; i++) brelse(bhlist[i]); - bh = bhlist[0]; wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 30319dc33c18..84a194b77f19 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -456,7 +456,7 @@ static int print_internal(struct buffer_head *bh, int first, int last) to = B_NR_ITEMS(bh); } else { from = first; - to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh); + to = min_t(int, last, B_NR_ITEMS(bh)); } reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 4a7cb16e9345..3dba8acf4e83 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -411,7 +411,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -441,7 +441,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 8096c74c38ac..7b498a0d060b 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -97,7 +97,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) * using the copy_size var below allows this code to work for * both shrinking and expanding the FS. */ - copy_size = bmap_nr_new < bmap_nr ? 
bmap_nr_new : bmap_nr; + copy_size = min(bmap_nr_new, bmap_nr); copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *); for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 9a293609a022..84c12a1947b2 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s, if (!buffer_uptodate(bh[j])) { if (depth == -1) depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j); + bh_readahead(bh[j], REQ_RAHEAD); } brelse(bh[j]); } @@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, if (!buffer_uptodate(bh) && depth == -1) depth = reiserfs_write_unlock_nested(sb); - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); wait_on_buffer(bh); if (depth != -1) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c88cd2ce0665..929acce6e731 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1702,9 +1702,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s)); - wait_on_buffer(SB_BUFFER_WITH_SB(s)); - if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; } @@ -2504,9 +2502,7 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, len = i_size - off; toread = len; while (toread > 0) { - tocopy = - sb->s_blocksize - offset < - toread ? 
sb->s_blocksize - offset : toread; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; /* * Quota files are without tails so we can safely @@ -2554,8 +2550,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, return -EIO; } while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; reiserfs_write_lock(sb); err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 2cab413fffee..7d605db3bb3b 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -1101,7 +1101,11 @@ struct smb2_change_notify_rsp { #define SMB2_CREATE_REQUEST_LEASE "RqLs" #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" -#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" +#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" +#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74" +#define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10" +#define SVHDX_OPEN_DEVICE_CONTEXT "\x9C\xCB\xCF\x9E\x04\xC1\xE6\x43\x98\x0E\x15\x8D\xA1\xF6\xEC\x83" +#define SMB2_CREATE_TAG_AAPL "AAPL" /* Flag (SMB3 open response) values */ #define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e56510964b22..8ba8c4c50770 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -506,8 +506,9 @@ static int squashfs_readahead_fragment(struct page **page, squashfs_i(inode)->fragment_size); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + int error = buffer->error; - if 
(buffer->error) + if (error) goto out; expected += squashfs_i(inode)->fragment_offset; @@ -529,7 +530,7 @@ static int squashfs_readahead_fragment(struct page **page, out: squashfs_cache_put(buffer); - return buffer->error; + return error; } static void squashfs_readahead(struct readahead_control *ractl) @@ -557,6 +558,13 @@ static void squashfs_readahead(struct readahead_control *ractl) int res, bsize; u64 block = 0; unsigned int expected; + struct page *last_page; + + expected = start >> msblk->block_log == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + max_pages = (expected + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = __readahead_batch(ractl, pages, max_pages); if (!nr_pages) @@ -566,13 +574,10 @@ static void squashfs_readahead(struct readahead_control *ractl) goto skip_pages; index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) goto skip_pages; - expected = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - if (index == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, @@ -593,15 +598,15 @@ static void squashfs_readahead(struct readahead_control *ractl) res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - squashfs_page_actor_free(actor); + last_page = squashfs_page_actor_free(actor); if (res == expected) { int bytes; /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (pages[nr_pages - 1]->index == file_end && bytes) - memzero_page(pages[nr_pages - 1], bytes, + if (index == file_end && bytes && last_page) + memzero_page(last_page, bytes, PAGE_SIZE - bytes); for (i = 0; i < nr_pages; i++) { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 54b93bf4a25c..81af6c4ca115 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -71,11 +71,13 @@ static void *handle_next_page(struct 
squashfs_page_actor *actor) (actor->next_index != actor->page[actor->next_page]->index)) { actor->next_index++; actor->returned_pages++; + actor->last_page = NULL; return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM); } actor->next_index++; actor->returned_pages++; + actor->last_page = actor->page[actor->next_page]; return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); } @@ -125,6 +127,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->returned_pages = 0; actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 95ffbb543d91..97d4983559b1 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -16,6 +16,7 @@ struct squashfs_page_actor { void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); + struct page *last_page; int pages; int length; int next_page; @@ -29,10 +30,13 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, struct page **page, int pages, int length); -static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor) +static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { + struct page *last_page = actor->last_page; + kfree(actor->tmp_buffer); kfree(actor); + return last_page; } static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { diff --git a/fs/super.c b/fs/super.c index 6a82660e1adb..8d39e4f11cfa 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,6 
+291,7 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); + fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -479,7 +480,7 @@ void generic_shutdown_super(struct super_block *sb) evict_inodes(sb); /* only nonzero refcount inodes can have marks */ fsnotify_sb_delete(sb); - fscrypt_sb_delete(sb); + fscrypt_destroy_keyring(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index c57b46a352d8..3125e76376ee 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } +/** + * ubifs_encrypt - Encrypt data. + * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of data to be compressed + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypt a possibly-compressed data in the data node. + * The encrypted data length will store in @out_len. 
+ */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index fc718f6178f2..3f128b9fdfbb 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2467,7 +2467,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!((prandom_u32() % out_of) + 1 <= n); + return !!(prandom_u32_max(out_of) + 1 <= n); } @@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail within 1 minute */ - delay = prandom_u32() % 60000; + delay = prandom_u32_max(60000); d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn(c, "failing after %lums", delay); } else { d->pc_delay = 2; - delay = prandom_u32() % 10000; + delay = prandom_u32_max(10000); /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn(c, "failing after %lu calls", delay); @@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32() % len; + from = prandom_u32_max(len); /* Corruption span max to end of write unit */ to = min(len, ALIGN(from + 1, c->max_write_size)); @@ -2581,7 +2581,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, if (ffs) memset(p + from, 0xFF, to - from); else - prandom_bytes(p + from, to - from); + get_random_bytes(p + from, to - from); return to; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 86151889548e..0f29cf201136 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and * 
initializes it. Returns new inode in case of success and an error code in * case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; - err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } } switch (mode & S_IFMT) { @@ -309,7 +312,7 @@ static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -370,7 +373,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -424,8 +427,9 @@ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) } static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, @@ -462,7 +466,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; } - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -475,7 
+479,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); ubifs_assert(c, ui->dirty); instantiated = 1; @@ -489,7 +493,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, ubifs_release_budget(c, &req); - return 0; + return finish_open_simple(file, 0); out_cancel: unlock_2_inodes(dir, inode); @@ -872,7 +876,7 @@ out_fname: } /** - * check_dir_empty - check if a directory is empty or not. + * ubifs_check_dir_empty - check if a directory is empty or not. * @dir: VFS inode object of the directory to check * * This function checks if directory @dir is empty. Returns zero if the @@ -1004,7 +1008,7 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1091,7 +1095,7 @@ static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1173,7 +1177,7 @@ static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 75dab0ae3939..d02509920baf 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -503,7 +503,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) { if 
(c->double_hash) - dent->cookie = (__force __le32) prandom_u32(); + dent->cookie = (__force __le32) get_random_u32(); else dent->cookie = 0; } @@ -1472,23 +1472,25 @@ out_free: * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. */ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in } if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out; - out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1550,6 +1552,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, struct ubifs_trun_node *trun; struct ubifs_data_node *dn; int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int dn_size; struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1565,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); - sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ 
+ - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; - sz += ubifs_auth_node_sz(c); + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE; + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c); ino = kmalloc(sz, GFP_NOFS); if (!ino) @@ -1596,15 +1602,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; } if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index d76a19e460cd..cfbc31f709f4 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) return 0; - if (prandom_u32() & 3) + if (prandom_u32_max(4)) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i 
< heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 58c92c96ecef..01362ad5f804 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !(prandom_u32() & 7)) + if (dbg_is_chk_index(c) && !prandom_u32_max(8)) return -ENOSPC; return 0; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 7d6d2f152e03..478bbbb5382f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2026,7 +2026,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e4c4761aff7f..3db8486e3725 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index cad3772f9dbe..be640f4b2f2c 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) brelse(tmp); } if (num) { - 
ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); + bh_readahead_batch(num, bha, REQ_RAHEAD); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index a2adf6293093..16bcf2c6b8b3 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); + bh_readahead_batch(num, bha, REQ_RAHEAD); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/file.c b/fs/udf/file.c index 09aef77269fe..5c659e23e578 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -252,6 +252,7 @@ const struct file_operations udf_file_operations = { .release = udf_release_file, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8d06daed549f..dce6ae9ae306 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1211,13 +1211,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, if (!bh) return NULL; - if (buffer_uptodate(bh)) - return bh; - - ll_rw_block(REQ_OP_READ, 1, &bh); - - wait_on_buffer(bh); - if (buffer_uptodate(bh)) + if (bh_read(bh, 0) >= 0) return bh; brelse(bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index b3d5f97f16cd..fb4c30e05245 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -626,7 +626,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, } static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -640,9 +640,9 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + 
d_tmpfile(file, inode); unlock_new_inode(inode); - return 0; + return finish_open_simple(file, 0); } static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index bd810d8239f2..2436e3f82147 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -295,14 +295,10 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, oldb + pos); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - ufs_error(inode->i_sb, __func__, - "read of block failed\n"); - break; - } + if (bh_read(bh, 0) < 0) { + ufs_error(inode->i_sb, __func__, + "read of block failed\n"); + break; } UFSD(" change from %llu to %llu, pos %u\n", diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0c1d33c4f74c..07c81ab3fd4d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/swapops.h> +#include <linux/miscdevice.h> int sysctl_unprivileged_userfaultfd __read_mostly; @@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; - if ((vmf->flags & FAULT_FLAG_USER) == 0 && - ctx->flags & UFFD_USER_MODE_ONLY) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) goto out; - } /* * If it's already released don't get it. 
This avoids to loop @@ -615,14 +611,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, if (release_new_ctx) { struct vm_area_struct *vma; struct mm_struct *mm = release_new_ctx->mm; + VMA_ITERATOR(vmi, mm, 0); /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; vma->vm_flags &= ~__VM_UFFD_FLAGS; } + } mmap_write_unlock(mm); userfaultfd_ctx_put(release_new_ctx); @@ -803,11 +801,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *unmaps) +int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *unmaps) { - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + VMA_ITERATOR(vmi, mm, start); + struct vm_area_struct *vma; + + for_each_vma_range(vmi, vma, end) { struct userfaultfd_unmap_ctx *unmap_ctx; struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; @@ -857,6 +857,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + MA_STATE(mas, &mm->mm_mt, 0, 0); WRITE_ONCE(ctx->released, true); @@ -873,7 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -887,10 +888,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) + if 
(prev) { + mas_pause(&mas); vma = prev; - else + } else { prev = vma; + } + vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1272,6 +1276,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; + MA_STATE(mas, &mm->mm_mt, 0, 0); user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1314,7 +1319,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1339,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1399,8 +1405,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1430,6 +1438,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto next; } @@ -1437,11 +1447,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = split_vma(mm, vma, start, 1); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } next: /* @@ -1458,8 +1472,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } 
while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1503,6 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; + MA_STATE(mas, &mm->mm_mt, 0, 0); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1521,7 +1536,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1546,7 +1562,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, */ found = false; ret = -EINVAL; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1566,8 +1582,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1636,8 +1654,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -2056,20 +2074,11 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); } -SYSCALL_DEFINE1(userfaultfd, int, flags) +static int new_userfaultfd(int flags) { struct userfaultfd_ctx *ctx; int fd; - if (!sysctl_unprivileged_userfaultfd && - (flags & UFFD_USER_MODE_ONLY) == 0 && - !capable(CAP_SYS_PTRACE)) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); - 
return -EPERM; - } - BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ @@ -2102,8 +2111,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) return fd; } +static inline bool userfaultfd_syscall_allowed(int flags) +{ + /* Userspace-only page faults are always allowed */ + if (flags & UFFD_USER_MODE_ONLY) + return true; + + /* + * The user is requesting a userfaultfd which can handle kernel faults. + * Privileged users are always allowed to do this. + */ + if (capable(CAP_SYS_PTRACE)) + return true; + + /* Otherwise, access to kernel fault handling is sysctl controlled. */ + return sysctl_unprivileged_userfaultfd; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + if (!userfaultfd_syscall_allowed(flags)) + return -EPERM; + + return new_userfaultfd(flags); +} + +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) +{ + if (cmd != USERFAULTFD_IOC_NEW) + return -EINVAL; + + return new_userfaultfd(flags); +} + +static const struct file_operations userfaultfd_dev_fops = { + .unlocked_ioctl = userfaultfd_dev_ioctl, + .compat_ioctl = userfaultfd_dev_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice userfaultfd_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "userfaultfd", + .fops = &userfaultfd_dev_fops +}; + static int __init userfaultfd_init(void) { + int ret; + + ret = misc_register(&userfaultfd_misc); + if (ret) + return ret; + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", sizeof(struct userfaultfd_ctx), 0, diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 517a138faa66..191b22b9a35b 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -133,6 +133,21 @@ xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) return true; } +static inline bool +xfs_verify_agbext( + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_agblock_t len) +{ + if (agbno + len <= agbno) + return false; + + if (!xfs_verify_agbno(pag, agbno)) + 
return false; + + return xfs_verify_agbno(pag, agbno + len - 1); +} + /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index e2bdf089c0a3..de79f5d07f65 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -263,11 +263,7 @@ xfs_alloc_get_rec( goto out_bad_rec; /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, *bno)) - goto out_bad_rec; - if (*bno > *bno + *len) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, *bno + *len - 1)) + if (!xfs_verify_agbext(pag, *bno, *len)) goto out_bad_rec; return 0; @@ -1520,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm. */ - if (prandom_u32() & 1) + if (prandom_u32_max(2)) return 0; #endif diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e56723dc9cd5..49d0d4ea63fc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -294,7 +294,7 @@ xfs_check_block( else thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); xfs_err(mp, "%s: ptrs are equal in node\n", diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e7201dc68f43..e576560b46e9 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2192,8 +2192,8 @@ xfs_da_grow_inode_int( */ mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { - nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); + nmap = min(XFS_BMAP_MAX_NMAP, c); error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->total, &mapp[mapi], &nmap); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 76eedc2756b3..92bac3373f1f 100644 --- 
a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -261,7 +261,7 @@ xfs_dir_createname( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -357,7 +357,7 @@ xfs_dir_lookup( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -435,7 +435,7 @@ xfs_dir_removename( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -493,7 +493,7 @@ xfs_dir_replace( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -610,19 +610,23 @@ xfs_dir2_grow_inode( int xfs_dir2_isblock( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isblock) { - xfs_fileoff_t last; /* last file offset */ - int rval; + struct xfs_mount *mp = args->dp->i_mount; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (XFS_IS_CORRUPT(args->dp->i_mount, - rval != 0 && - args->dp->i_disk_size != args->geo->blksize)) + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isblock = false; + if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) + return 0; + + *isblock = true; + if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) return -EFSCORRUPTED; - *vp = rval; return 0; } @@ -632,14 +636,20 @@ xfs_dir2_isblock( int xfs_dir2_isleaf( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isleaf) { - xfs_fileoff_t last; /* last file offset */ - int rval; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - *vp = last == args->geo->leafblk + args->geo->fsbcount; 
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isleaf = false; + if (eof != args->geo->leafblk + args->geo->fsbcount) + return 0; + + *isleaf = true; return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index b6df3c34b26a..dd39f17dd9a9 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -61,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index d9b66306a9a7..cb9e950a911d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -146,6 +146,8 @@ xfs_dir3_leaf_check_int( xfs_dir2_leaf_tail_t *ltp; int stale; int i; + bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC); ltp = xfs_dir2_leaf_tail_p(geo, leaf); @@ -158,8 +160,7 @@ xfs_dir3_leaf_check_int( return __this_address; /* Leaves and bests don't overlap in leaf format. 
*/ - if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || - hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + if (isleaf1 && (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address; @@ -175,6 +176,10 @@ xfs_dir3_leaf_check_int( } if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; + if (isleaf1 && xfs_dir2_dataptr_to_db(geo, + be32_to_cpu(hdr->ents[i].address)) >= + be32_to_cpu(ltp->bestcount)) + return __this_address; } if (hdr->stale != stale) return __this_address; diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 003812fd7d35..8cd37e6e9d38 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -865,7 +865,6 @@ xfs_dir2_sf_lookup( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ - int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ @@ -929,8 +928,7 @@ xfs_dir2_sf_lookup( if (!ci_sfep) return -ENOENT; /* otherwise process the CI match as required by the caller */ - error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); - return error; + return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); } /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b55bdfa9c8a8..371dc07233e0 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1564,20 +1564,6 @@ struct xfs_rmap_rec { #define RMAPBT_UNUSED_OFFSET_BITLEN 7 #define RMAPBT_OFFSET_BITLEN 54 -#define XFS_RMAP_ATTR_FORK (1 << 0) -#define XFS_RMAP_BMBT_BLOCK (1 << 1) -#define XFS_RMAP_UNWRITTEN (1 << 2) -#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ - XFS_RMAP_BMBT_BLOCK) -#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) -struct xfs_rmap_irec { - xfs_agblock_t rm_startblock; /* extent start block */ - xfs_extlen_t rm_blockcount; /* extent length */ - uint64_t rm_owner; /* extent owner */ - uint64_t 
rm_offset; /* offset within the owner */ - unsigned int rm_flags; /* state flags */ -}; - /* * Key structure * @@ -1626,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp); * on the startblock. This speeds up mount time deletion of stale * staging extents because they're all at the right side of the tree. */ -#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31)) +#define XFS_REFC_COWFLAG (1U << 31) #define REFCNTBT_COWFLAG_BITLEN 1 #define REFCNTBT_AGBLOCK_BITLEN 31 @@ -1640,12 +1626,6 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -struct xfs_refcount_irec { - xfs_agblock_t rc_startblock; /* starting block number */ - xfs_extlen_t rc_blockcount; /* count of free blocks */ - xfs_nlink_t rc_refcount; /* number of inodes linked here */ -}; - #define MAXREFCOUNT ((xfs_nlink_t)~0U) #define MAXREFCEXTLEN ((xfs_extlen_t)~0U) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 6cdfd64bc56b..94db50eb706a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc( /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32() & 1; + do_sparse = prandom_u32_max(2); #endif /* @@ -805,7 +805,7 @@ sparse_alloc: * number from being easily guessable. 
*/ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, - args.agbno, args.len, prandom_u32()); + args.agbno, args.len, get_random_u32()); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9327a4f39206..6b21760184d9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -78,7 +78,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %zd).", + "corrupt inode %llu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -192,7 +192,7 @@ xfs_iformat_btree( XFS_DFORK_SIZE(dip, mp, whichfork) || ifp->if_nextents > ip->i_nblocks) || level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { - xfs_warn(mp, "corrupt inode %Lu (btree).", + xfs_warn(mp, "corrupt inode %llu (btree).", (unsigned long long) ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b351b9dc6561..f13e0809dc63 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -613,25 +613,49 @@ typedef struct xfs_efi_log_format { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ + xfs_extent_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_t; +static inline size_t +xfs_efi_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t 
efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ + xfs_extent_32_t efi_extents[]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; +static inline size_t +xfs_efi_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ + xfs_extent_64_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_64_t; +static inline size_t +xfs_efi_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * This is the structure used to lay out an efd log item in the * log. The efd_extents array is a variable size array whose @@ -642,25 +666,49 @@ typedef struct xfs_efd_log_format { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ + xfs_extent_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_t; +static inline size_t +xfs_efd_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ + xfs_extent_32_t efd_extents[]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; +static inline size_t +xfs_efd_log_format32_sizeof( + unsigned int nr) 
+{ + return sizeof(struct xfs_efd_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ + xfs_extent_64_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_64_t; +static inline size_t +xfs_efd_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * RUI/RUD (reverse mapping) log format definitions */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 64b910caafaa..3f34bafe18dd 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -46,13 +46,16 @@ STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, int xfs_refcount_lookup_le( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); } @@ -63,13 +66,16 @@ xfs_refcount_lookup_le( int xfs_refcount_lookup_ge( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } @@ -80,13 +86,16 @@ xfs_refcount_lookup_ge( 
int xfs_refcount_lookup_eq( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); } @@ -96,7 +105,17 @@ xfs_refcount_btrec_to_irec( const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) { - irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + uint32_t start; + + start = be32_to_cpu(rec->refc.rc_startblock); + if (start & XFS_REFC_COWFLAG) { + start &= ~XFS_REFC_COWFLAG; + irec->rc_domain = XFS_REFC_DOMAIN_COW; + } else { + irec->rc_domain = XFS_REFC_DOMAIN_SHARED; + } + + irec->rc_startblock = start; irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); } @@ -114,7 +133,6 @@ xfs_refcount_get_rec( struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; - xfs_agblock_t realstart; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !*stat) @@ -124,22 +142,11 @@ xfs_refcount_get_rec( if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; - /* handle special COW-staging state */ - realstart = irec->rc_startblock; - if (realstart & XFS_REFC_COW_START) { - if (irec->rc_refcount != 1) - goto out_bad_rec; - realstart &= ~XFS_REFC_COW_START; - } else if (irec->rc_refcount < 2) { + if (!xfs_refcount_check_domain(irec)) goto out_bad_rec; - } /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, realstart)) - goto out_bad_rec; - if (realstart > realstart + irec->rc_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rc_startblock, 
irec->rc_blockcount)) goto out_bad_rec; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) @@ -169,12 +176,17 @@ xfs_refcount_update( struct xfs_refcount_irec *irec) { union xfs_btree_rec rec; + uint32_t start; int error; trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); - rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec.refc.rc_startblock = cpu_to_be32(start); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, @@ -196,9 +208,12 @@ xfs_refcount_insert( int error; trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + cur->bc_rec.rc.rc_domain = irec->rc_domain; + error = xfs_btree_insert(cur, i); if (error) goto out_error; @@ -244,7 +259,8 @@ xfs_refcount_delete( } if (error) goto out_error; - error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); + error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock, + &found_rec); out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, @@ -343,6 +359,7 @@ xfs_refc_next( STATIC int xfs_refcount_split_extent( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t agbno, bool *shape_changed) { @@ -351,7 +368,7 @@ xfs_refcount_split_extent( int error; *shape_changed = false; - error = xfs_refcount_lookup_le(cur, agbno, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec); if (error) goto out_error; if (!found_rec) @@ -364,6 +381,8 @@ xfs_refcount_split_extent( error = -EFSCORRUPTED; goto out_error; } + if (rcext.rc_domain != domain) + return 0; if (rcext.rc_startblock == agbno || 
xfs_refc_next(&rcext) <= agbno) return 0; @@ -415,6 +434,9 @@ xfs_refcount_merge_center_extents( trace_xfs_refcount_merge_center_extents(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, center, right); + ASSERT(left->rc_domain == center->rc_domain); + ASSERT(right->rc_domain == center->rc_domain); + /* * Make sure the center and right extents are not in the btree. * If the center extent was synthesized, the first delete call @@ -423,8 +445,8 @@ xfs_refcount_merge_center_extents( * call removes the center and the second one removes the right * extent. */ - error = xfs_refcount_lookup_ge(cur, center->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_ge(cur, center->rc_domain, + center->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -451,8 +473,8 @@ xfs_refcount_merge_center_extents( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -491,10 +513,12 @@ xfs_refcount_merge_left_extent( trace_xfs_refcount_merge_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft); + ASSERT(left->rc_domain == cleft->rc_domain); + /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cleft->rc_domain, + cleft->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -512,8 +536,8 @@ xfs_refcount_merge_left_extent( } /* Enlarge the left extent. 
*/ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -552,13 +576,15 @@ xfs_refcount_merge_right_extent( trace_xfs_refcount_merge_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right); + ASSERT(right->rc_domain == cright->rc_domain); + /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, * remove it. */ if (cright->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cright->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cright->rc_domain, + cright->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -576,8 +602,8 @@ xfs_refcount_merge_right_extent( } /* Enlarge the right extent. */ - error = xfs_refcount_lookup_le(cur, right->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, right->rc_domain, + right->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -600,8 +626,6 @@ out_error: return error; } -#define XFS_FIND_RCEXT_SHARED 1 -#define XFS_FIND_RCEXT_COW 2 /* * Find the left extent and the one after it (cleft). This function assumes * that we've already split any extent crossing agbno. 
@@ -611,16 +635,16 @@ xfs_refcount_find_left_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *left, struct xfs_refcount_irec *cleft, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec); if (error) goto out_error; if (!found_rec) @@ -634,11 +658,9 @@ xfs_refcount_find_left_extents( goto out_error; } - if (xfs_refc_next(&tmp) != agbno) - return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (xfs_refc_next(&tmp) != agbno) return 0; /* We have a left extent; retrieve (or invent) the next right one */ *left = tmp; @@ -655,6 +677,9 @@ xfs_refcount_find_left_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp starts at the end of our range, just use that */ if (tmp.rc_startblock == agbno) *cleft = tmp; @@ -671,8 +696,10 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = min(aglen, tmp.rc_startblock - agbno); cleft->rc_refcount = 1; + cleft->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. 
@@ -680,6 +707,7 @@ xfs_refcount_find_left_extents( cleft->rc_startblock = agbno; cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; + cleft->rc_domain = domain; } trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft, agbno); @@ -700,16 +728,16 @@ xfs_refcount_find_right_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec); if (error) goto out_error; if (!found_rec) @@ -723,11 +751,9 @@ xfs_refcount_find_right_extents( goto out_error; } - if (tmp.rc_startblock != agbno + aglen) - return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (tmp.rc_startblock != agbno + aglen) return 0; /* We have a right extent; retrieve (or invent) the next left one */ *right = tmp; @@ -744,6 +770,9 @@ xfs_refcount_find_right_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp ends at the end of our range, just use that */ if (xfs_refc_next(&tmp) == agbno + aglen) *cright = tmp; @@ -760,8 +789,10 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = right->rc_startblock - cright->rc_startblock; cright->rc_refcount = 1; + cright->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. 
@@ -769,6 +800,7 @@ xfs_refcount_find_right_extents( cright->rc_startblock = agbno; cright->rc_blockcount = aglen; cright->rc_refcount = 1; + cright->rc_domain = domain; } trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right, agbno + aglen); @@ -794,10 +826,10 @@ xfs_refc_valid( STATIC int xfs_refcount_merge_extents( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t *agbno, xfs_extlen_t *aglen, enum xfs_refc_adjust_op adjust, - int flags, bool *shape_changed) { struct xfs_refcount_irec left = {0}, cleft = {0}; @@ -812,12 +844,12 @@ xfs_refcount_merge_extents( * just below (agbno + aglen) [cright], and just above (agbno + aglen) * [right]. */ - error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, - *aglen, flags); + error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain, + *agbno, *aglen); if (error) return error; - error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, - *aglen, flags); + error = xfs_refcount_find_right_extents(cur, &right, &cright, domain, + *agbno, *aglen); if (error) return error; @@ -870,7 +902,7 @@ xfs_refcount_merge_extents( aglen); } - return error; + return 0; } /* @@ -933,7 +965,8 @@ xfs_refcount_adjust_extents( if (*aglen == 0) return 0; - error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno, + &found_rec); if (error) goto out_error; @@ -941,10 +974,11 @@ xfs_refcount_adjust_extents( error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; - if (!found_rec) { + if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_SHARED; } /* @@ -957,6 +991,8 @@ xfs_refcount_adjust_extents( tmp.rc_blockcount = min(*aglen, ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; + tmp.rc_domain = XFS_REFC_DOMAIN_SHARED; + 
trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &tmp); @@ -986,15 +1022,30 @@ xfs_refcount_adjust_extents( (*agbno) += tmp.rc_blockcount; (*aglen) -= tmp.rc_blockcount; - error = xfs_refcount_lookup_ge(cur, *agbno, + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* Move the cursor to the start of ext. */ + error = xfs_refcount_lookup_ge(cur, + XFS_REFC_DOMAIN_SHARED, *agbno, &found_rec); if (error) goto out_error; } - /* Stop if there's nothing left to modify */ - if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) - break; + /* + * A previous step trimmed agbno/aglen such that the end of the + * range would not be in the middle of the record. If this is + * no longer the case, something is seriously wrong with the + * btree. Make sure we never feed the synthesized record into + * the processing loop below. + */ + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) || + XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) { + error = -EFSCORRUPTED; + goto out_error; + } /* * Adjust the reference count and either update the tree @@ -1070,13 +1121,15 @@ xfs_refcount_adjust( /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno + aglen, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1085,8 +1138,8 @@ xfs_refcount_adjust( /* * Try to merge with the left or right extents of the range. 
*/ - error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, - XFS_FIND_RCEXT_SHARED, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, + new_agbno, new_aglen, adj, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1125,6 +1178,32 @@ xfs_refcount_finish_one_cleanup( } /* + * Set up a continuation a deferred refcount operation by updating the intent. + * Checks to make sure we're not going to run off the end of the AG. + */ +static inline int +xfs_refcount_continue_op( + struct xfs_btree_cur *cur, + xfs_fsblock_t startblock, + xfs_agblock_t new_agbno, + xfs_extlen_t new_len, + xfs_fsblock_t *new_fsbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; + + if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) + return -EFSCORRUPTED; + + *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + + ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); + ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); + + return 0; +} + +/* * Process one of the deferred refcount operations. We pass back the * btree cursor to maintain our lock on the btree between calls. 
* This saves time and eliminates a buffer deadlock between the @@ -1191,12 +1270,20 @@ xfs_refcount_finish_one( case XFS_REFCOUNT_INCREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_INCREASE); - *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_DECREASE); - *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_ALLOC_COW: *new_fsb = startblock + blockcount; @@ -1307,7 +1394,8 @@ xfs_refcount_find_shared( *flen = 0; /* Try to find a refcount extent that crosses the start */ - error = xfs_refcount_lookup_le(cur, agbno, &have); + error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno, + &have); if (error) goto out_error; if (!have) { @@ -1325,6 +1413,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; /* If the extent ends before the start, look at the next one */ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { @@ -1340,6 +1430,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; } /* If the extent starts after the range we want, bail out */ @@ -1371,7 +1463,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } - if (tmp.rc_startblock >= agbno + aglen || + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED || + tmp.rc_startblock >= agbno + aglen || tmp.rc_startblock != *fbno + *flen) break; *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); @@ -1455,17 +1548,23 @@ xfs_refcount_adjust_cow_extents( 
return 0; /* Find any overlapping refcount records */ - error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno, + &found_rec); if (error) goto out_error; error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec && + ext.rc_domain != XFS_REFC_DOMAIN_COW)) { + error = -EFSCORRUPTED; + goto out_error; + } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + - XFS_REFC_COW_START; + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_COW; } switch (adj) { @@ -1480,6 +1579,8 @@ xfs_refcount_adjust_cow_extents( tmp.rc_startblock = agbno; tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; + tmp.rc_domain = XFS_REFC_DOMAIN_COW; + trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &tmp); @@ -1542,24 +1643,24 @@ xfs_refcount_adjust_cow( bool shape_changed; int error; - agbno += XFS_REFC_COW_START; - /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno, &shape_changed); if (error) goto out_error; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno + aglen, &shape_changed); if (error) goto out_error; /* * Try to merge with the left or right extents of the range. 
*/ - error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, - XFS_FIND_RCEXT_COW, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno, + &aglen, adj, &shape_changed); if (error) goto out_error; @@ -1666,10 +1767,18 @@ xfs_refcount_recover_extent( be32_to_cpu(rec->refc.rc_refcount) != 1)) return -EFSCORRUPTED; - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); + rr = kmalloc(sizeof(struct xfs_refcount_recovery), + GFP_KERNEL | __GFP_NOFAIL); + INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - list_add_tail(&rr->rr_list, debris); + if (XFS_IS_CORRUPT(cur->bc_mp, + rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { + kfree(rr); + return -EFSCORRUPTED; + } + + list_add_tail(&rr->rr_list, debris); return 0; } @@ -1687,10 +1796,11 @@ xfs_refcount_recover_cow_leftovers( union xfs_btree_irec low; union xfs_btree_irec high; xfs_fsblock_t fsb; - xfs_agblock_t agbno; int error; - if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); + if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) return -EOPNOTSUPP; INIT_LIST_HEAD(&debris); @@ -1717,7 +1827,7 @@ xfs_refcount_recover_cow_leftovers( /* Find all the leftover CoW staging extents. 
*/ memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); - low.rc.rc_startblock = XFS_REFC_COW_START; + low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW; high.rc.rc_startblock = -1U; error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); @@ -1738,8 +1848,8 @@ xfs_refcount_recover_cow_leftovers( &rr->rr_rrec); /* Free the orphan record */ - agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; - fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno); + fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, + rr->rr_rrec.rc_startblock); xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); @@ -1751,7 +1861,7 @@ xfs_refcount_recover_cow_leftovers( goto out_free; list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; @@ -1761,7 +1871,7 @@ out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; } @@ -1770,6 +1880,7 @@ out_free: int xfs_refcount_has_record( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, bool *exists) @@ -1781,6 +1892,7 @@ xfs_refcount_has_record( low.rc.rc_startblock = bno; memset(&high, 0xFF, sizeof(high)); high.rc.rc_startblock = bno + len - 1; + low.rc.rc_domain = high.rc.rc_domain = domain; return xfs_btree_has_record(cur, &low, &high, exists); } diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index e8b322de7f3d..452f30556f5a 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -14,14 +14,33 @@ struct xfs_bmbt_irec; struct xfs_refcount_irec; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_eq(struct 
xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); +static inline uint32_t +xfs_refcount_encode_startblock( + xfs_agblock_t startblock, + enum xfs_refc_domain domain) +{ + uint32_t start; + + /* + * low level btree operations need to handle the generic btree range + * query functions (which set rc_domain == -1U), so we check that the + * domain is /not/ shared. + */ + start = startblock & ~XFS_REFC_COWFLAG; + if (domain != XFS_REFC_DOMAIN_SHARED) + start |= XFS_REFC_COWFLAG; + + return start; +} + enum xfs_refcount_intent_type { XFS_REFCOUNT_INCREASE = 1, XFS_REFCOUNT_DECREASE, @@ -36,6 +55,18 @@ struct xfs_refcount_intent { xfs_fsblock_t ri_startblock; }; +/* Check that the refcount is appropriate for the record domain. */ +static inline bool +xfs_refcount_check_domain( + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1) + return false; + if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2) + return false; + return true; +} + void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, @@ -79,7 +110,8 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, #define XFS_REFCOUNT_ITEM_OVERHEAD 32 extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, - xfs_agblock_t bno, xfs_extlen_t len, bool *exists); + enum xfs_refc_domain domain, xfs_agblock_t bno, + xfs_extlen_t len, bool *exists); union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 316c1ec0c3c2..e1f789866683 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -13,6 
+13,7 @@ #include "xfs_btree.h" #include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" +#include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -160,7 +161,12 @@ xfs_refcountbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { - rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); } @@ -182,10 +188,13 @@ xfs_refcountbt_key_diff( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { - struct xfs_refcount_irec *rec = &cur->bc_rec.rc; const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; - return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; } STATIC int64_t diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 094dfc897ebc..b56aca1e7c66 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -235,13 +235,8 @@ xfs_rmap_get_rec( goto out_bad_rec; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, irec->rm_startblock)) - goto out_bad_rec; - if (irec->rm_startblock > - irec->rm_startblock + irec->rm_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, - irec->rm_startblock + irec->rm_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rm_startblock, + irec->rm_blockcount)) goto out_bad_rec; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 2c4ad6e4bb14..5b2f27cbdb80 100644 --- 
a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -422,7 +422,7 @@ xfs_calc_itruncate_reservation_minlogsize( /* * In renaming a files we can modify: - * the four inodes involved: 4 * inode size + * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets @@ -437,7 +437,7 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 4) + + max((xfs_calc_inode_res(mp, 5) + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index a6b7d98cf68f..5ebdda7e1078 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -166,6 +166,36 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +enum xfs_refc_domain { + XFS_REFC_DOMAIN_SHARED = 0, + XFS_REFC_DOMAIN_COW, +}; + +#define XFS_REFC_DOMAIN_STRINGS \ + { XFS_REFC_DOMAIN_SHARED, "shared" }, \ + { XFS_REFC_DOMAIN_COW, "cow" } + +struct xfs_refcount_irec { + xfs_agblock_t rc_startblock; /* starting block number */ + xfs_extlen_t rc_blockcount; /* count of free blocks */ + xfs_nlink_t rc_refcount; /* number of inodes linked here */ + enum xfs_refc_domain rc_domain; /* shared or cow staging extent? 
*/ +}; + +#define XFS_RMAP_ATTR_FORK (1 << 0) +#define XFS_RMAP_BMBT_BLOCK (1 << 1) +#define XFS_RMAP_UNWRITTEN (1 << 2) +#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ + XFS_RMAP_BMBT_BLOCK) +#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) +struct xfs_rmap_irec { + xfs_agblock_t rm_startblock; /* extent start block */ + xfs_extlen_t rm_blockcount; /* extent length */ + uint64_t rm_owner; /* extent owner */ + uint64_t rm_offset; /* offset within the owner */ + unsigned int rm_flags; /* state flags */ +}; + /* per-AG block reservation types */ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index ab427b4d7fe0..3b38f4e2a537 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -100,9 +100,7 @@ xchk_allocbt_rec( bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + if (!xfs_verify_agbext(pag, bno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_allocbt_xref(bs->sc, bno, len); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index b594f02a52c4..5c87800ab223 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -676,7 +676,7 @@ xchk_directory_blocks( xfs_dablk_t dabno; xfs_dir2_db_t last_data_db = 0; bool found; - int is_block = 0; + bool is_block = false; int error; /* Ignore local format directories. 
*/ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index e1026e07bf94..e312be7cd375 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -108,9 +108,8 @@ xchk_iallocbt_chunk( xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + + if (!xfs_verify_agbext(pag, bno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index c68b767dc08f..a26ee0f24ef2 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -269,15 +269,13 @@ done: STATIC void xchk_refcountbt_xref_rmap( struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_nlink_t refcount) + const struct xfs_refcount_irec *irec) { struct xchk_refcnt_check refchk = { - .sc = sc, - .bno = bno, - .len = len, - .refcount = refcount, + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, .seen = 0, }; struct xfs_rmap_irec low; @@ -291,9 +289,9 @@ xchk_refcountbt_xref_rmap( /* Cross-reference with the rmapbt to confirm the refcount. */ memset(&low, 0, sizeof(low)); - low.rm_startblock = bno; + low.rm_startblock = irec->rc_startblock; memset(&high, 0xFF, sizeof(high)); - high.rm_startblock = bno + len - 1; + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; INIT_LIST_HEAD(&refchk.fragments); error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, @@ -302,7 +300,7 @@ xchk_refcountbt_xref_rmap( goto out_free; xchk_refcountbt_process_rmap_fragments(&refchk); - if (refcount != refchk.seen) + if (irec->rc_refcount != refchk.seen) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); out_free: @@ -315,17 +313,16 @@ out_free: /* Cross-reference with the other btrees. 
*/ STATIC void xchk_refcountbt_xref( - struct xfs_scrub *sc, - xfs_agblock_t agbno, - xfs_extlen_t len, - xfs_nlink_t refcount) + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) { if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; - xchk_xref_is_used_space(sc, agbno, len); - xchk_xref_is_not_inode_chunk(sc, agbno, len); - xchk_refcountbt_xref_rmap(sc, agbno, len, refcount); + xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount); + xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock, + irec->rc_blockcount); + xchk_refcountbt_xref_rmap(sc, irec); } /* Scrub a refcountbt record. */ @@ -334,35 +331,27 @@ xchk_refcountbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { + struct xfs_refcount_irec irec; xfs_agblock_t *cow_blocks = bs->private; struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; - xfs_extlen_t len; - xfs_nlink_t refcount; - bool has_cowflag; - bno = be32_to_cpu(rec->refc.rc_startblock); - len = be32_to_cpu(rec->refc.rc_blockcount); - refcount = be32_to_cpu(rec->refc.rc_refcount); + xfs_refcount_btrec_to_irec(rec, &irec); - /* Only CoW records can have refcount == 1. */ - has_cowflag = (bno & XFS_REFC_COW_START); - if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag)) + /* Check the domain and refcount are not incompatible. */ + if (!xfs_refcount_check_domain(&irec)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (has_cowflag) - (*cow_blocks) += len; + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + (*cow_blocks) += irec.rc_blockcount; /* Check the extent. 
*/ - bno &= ~XFS_REFC_COW_START; - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (refcount == 0) + if (irec.rc_refcount == 0) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - xchk_refcountbt_xref(bs->sc, bno, len, refcount); + xchk_refcountbt_xref(bs->sc, &irec); return 0; } @@ -426,7 +415,6 @@ xchk_xref_is_cow_staging( xfs_extlen_t len) { struct xfs_refcount_irec rc; - bool has_cowflag; int has_refcount; int error; @@ -434,8 +422,8 @@ xchk_xref_is_cow_staging( return; /* Find the CoW staging extent. */ - error = xfs_refcount_lookup_le(sc->sa.refc_cur, - agbno + XFS_REFC_COW_START, &has_refcount); + error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, &has_refcount); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; if (!has_refcount) { @@ -451,9 +439,8 @@ xchk_xref_is_cow_staging( return; } - /* CoW flag must be set, refcount must be 1. */ - has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START); - if (!has_cowflag || rc.rc_refcount != 1) + /* CoW lookup returned a shared extent record? 
*/ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); /* Must be at least as long as what was passed in */ @@ -477,7 +464,8 @@ xchk_xref_is_not_shared( if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); + error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED, + agbno, len, &shared); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; if (shared) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5077a7ad5646..2788a6f2edcd 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -86,8 +86,6 @@ xfs_attri_log_nameval_alloc( */ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + name_len + value_len); - if (!nv) - return nv; nv->name.i_addr = nv + 1; nv->name.i_len = name_len; @@ -247,28 +245,6 @@ xfs_attri_init( return attrip; } -/* - * Copy an attr format buffer from the given buf, and into the destination attr - * format structure. 
- */ -STATIC int -xfs_attri_copy_format( - struct xfs_log_iovec *buf, - struct xfs_attri_log_format *dst_attr_fmt) -{ - struct xfs_attri_log_format *src_attr_fmt = buf->i_addr; - size_t len; - - len = sizeof(struct xfs_attri_log_format); - if (buf->i_len != len) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; - } - - memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len); - return 0; -} - static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_attrd_log_item, attrd_item); @@ -441,8 +417,6 @@ xfs_attr_create_intent( attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, args->namelen, args->value, args->valuelen); } - if (!attr->xattri_nameval) - return ERR_PTR(-ENOMEM); attrip = xfs_attri_init(mp, attr->xattri_nameval); xfs_trans_add_item(tp, &attrip->attri_item); @@ -735,24 +709,50 @@ xlog_recover_attri_commit_pass2( struct xfs_attri_log_nameval *nv; const void *attr_value = NULL; const void *attr_name; - int error; + size_t len; attri_formatp = item->ri_buf[0].i_addr; attr_name = item->ri_buf[1].i_addr; /* Validate xfs_attri_log_format before the large memory allocation */ + len = sizeof(struct xfs_attri_log_format); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + if (!xfs_attri_validate(mp, attri_formatp)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + /* Validate the attr name */ + if (item->ri_buf[1].i_len != + xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { - XFS_ERROR_REPORT(__func__, 
XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[1].i_addr, item->ri_buf[1].i_len); return -EFSCORRUPTED; } - if (attri_formatp->alfi_value_len) + /* Validate the attr value, if present */ + if (attri_formatp->alfi_value_len != 0) { + if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + attr_value = item->ri_buf[2].i_addr; + } /* * Memory alloc failure will cause replay to abort. We attach the @@ -762,13 +762,9 @@ xlog_recover_attri_commit_pass2( nv = xfs_attri_log_nameval_alloc(attr_name, attri_formatp->alfi_name_len, attr_value, attri_formatp->alfi_value_len); - if (!nv) - return -ENOMEM; attrip = xfs_attri_init(mp, nv); - error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format); - if (error) - goto out; + memcpy(&attrip->attri_format, attri_formatp, len); /* * The ATTRI has two references. 
One for the ATTRD and one for ATTRI to @@ -780,10 +776,6 @@ xlog_recover_attri_commit_pass2( xfs_attri_release(attrip); xfs_attri_log_nameval_put(nv); return 0; -out: - xfs_attri_item_free(attrip); - xfs_attri_log_nameval_put(nv); - return error; } /* @@ -848,7 +840,8 @@ xlog_recover_attrd_commit_pass2( attrd_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 51f66e982484..41323da523d1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -608,28 +608,18 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_relog = xfs_bui_item_relog, }; -/* - * Copy an BUI format buffer from the given buf, and into the destination - * BUI format structure. The BUI/BUD items were designed not to need any - * special alignment handling. 
- */ -static int +static inline void xfs_bui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_bui_log_format *dst_bui_fmt) + struct xfs_bui_log_format *dst, + const struct xfs_bui_log_format *src) { - struct xfs_bui_log_format *src_bui_fmt; - uint len; + unsigned int i; - src_bui_fmt = buf->i_addr; - len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents)); - if (buf->i_len == len) { - memcpy(dst_bui_fmt, src_bui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->bui_nextents; i++) + memcpy(&dst->bui_extents[i], &src->bui_extents[i], + sizeof(struct xfs_map_extent)); } /* @@ -646,23 +636,34 @@ xlog_recover_bui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_bui_log_item *buip; struct xfs_bui_log_format *bui_formatp; + size_t len; bui_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } - buip = xfs_bui_init(mp); - error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); - if (error) { - xfs_bui_item_free(buip); - return error; + + len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + buip = xfs_bui_init(mp); + xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); /* * Insert 
the intent into the AIL directly and drop one reference so @@ -696,7 +697,8 @@ xlog_recover_bud_commit_pass2( bud_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index e295fc8062d8..9f3ceb461515 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -512,7 +512,7 @@ xfs_readdir( { struct xfs_da_args args = { NULL }; unsigned int lock_mode; - int isblock; + bool isblock; int error; trace_xfs_readdir(dp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 296faa41d81d..c6b2aabd6f18 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -234,13 +234,18 @@ int xfs_errortag_init( struct xfs_mount *mp) { + int ret; + mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, KM_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; - return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, - &mp->m_kobj, "errortag"); + ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, + &mp->m_kobj, "errortag"); + if (ret) + kmem_free(mp->m_errortag); + return ret; } void @@ -274,7 +279,7 @@ xfs_errortag_test( ASSERT(error_tag < XFS_ERRTAG_MAX); randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32() % randfactor) + if (!randfactor || prandom_u32_max(randfactor)) return false; xfs_warn_ratelimited(mp, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 27ccfcd82f04..d5130d1fcfae 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -66,27 +66,16 @@ xfs_efi_release( xfs_efi_item_free(efip); } -/* - * This returns the number of iovecs needed to log the given efi item. - * We only need 1 iovec for an efi item. It just logs the efi_log_format - * structure. 
- */ -static inline int -xfs_efi_item_sizeof( - struct xfs_efi_log_item *efip) -{ - return sizeof(struct xfs_efi_log_format) + - (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); -} - STATIC void xfs_efi_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); + *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } /* @@ -112,7 +101,7 @@ xfs_efi_item_format( xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format, - xfs_efi_item_sizeof(efip)); + xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents)); } @@ -155,13 +144,11 @@ xfs_efi_init( { struct xfs_efi_log_item *efip; - uint size; ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_efi_log_item) + - ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, 0); + efip = kzalloc(xfs_efi_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); } else { efip = kmem_cache_zalloc(xfs_efi_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -188,15 +175,17 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; - uint len = sizeof(xfs_efi_log_format_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); - uint len32 = sizeof(xfs_efi_log_format_32_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); - uint len64 = sizeof(xfs_efi_log_format_64_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents); + uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents); + uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents); if (buf->i_len == len) { - memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + memcpy(dst_efi_fmt, src_efi_fmt, + offsetof(struct xfs_efi_log_format, efi_extents)); + for (i = 0; i < 
src_efi_fmt->efi_nextents; i++) + memcpy(&dst_efi_fmt->efi_extents[i], + &src_efi_fmt->efi_extents[i], + sizeof(struct xfs_extent)); return 0; } else if (buf->i_len == len32) { xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; @@ -227,7 +216,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr, + buf->i_len); return -EFSCORRUPTED; } @@ -246,27 +236,16 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) kmem_cache_free(xfs_efd_cache, efdp); } -/* - * This returns the number of iovecs needed to log the given efd item. - * We only need 1 iovec for an efd item. It just logs the efd_log_format - * structure. - */ -static inline int -xfs_efd_item_sizeof( - struct xfs_efd_log_item *efdp) -{ - return sizeof(xfs_efd_log_format_t) + - (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); -} - STATIC void xfs_efd_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); + *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } /* @@ -291,7 +270,7 @@ xfs_efd_item_format( xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format, - xfs_efd_item_sizeof(efdp)); + xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents)); } /* @@ -340,9 +319,8 @@ xfs_trans_get_efd( ASSERT(nextents > 0); if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + - (nextents - 1) * sizeof(struct xfs_extent), - 0); + efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); } else { efdp = kmem_cache_zalloc(xfs_efd_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -733,6 +711,12 @@ xlog_recover_efi_commit_pass2( efi_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + 
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); if (error) { @@ -769,12 +753,24 @@ xlog_recover_efd_commit_pass2( xfs_lsn_t lsn) { struct xfs_efd_log_format *efd_formatp; + int buflen = item->ri_buf[0].i_len; efd_formatp = item->ri_buf[0].i_addr; - ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || - (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + + if (buflen < sizeof(struct xfs_efd_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } + + if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + efd_formatp->efd_nextents) && + item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + efd_formatp->efd_nextents)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id); return 0; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 186d0f2137f1..da6a5afa607c 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -52,6 +52,14 @@ struct xfs_efi_log_item { xfs_efi_log_format_t efi_format; }; +static inline size_t +xfs_efi_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_efi_log_item, efi_format) + + xfs_efi_log_format_sizeof(nr); +} + /* * This is the "extent free done" log item. 
It is used to log * the fact that some extents earlier mentioned in an efi item @@ -64,6 +72,14 @@ struct xfs_efd_log_item { xfs_efd_log_format_t efd_format; }; +static inline size_t +xfs_efd_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_efd_log_item, efd_format) + + xfs_efd_log_format_sizeof(nr); +} + /* * Max number of extents in fast allocation path. */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c6c80265c0b2..e462d39c840e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1261,7 +1261,7 @@ xfs_file_llseek( } #ifdef CONFIG_FS_DAX -static int +static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, enum page_entry_size pe_size, @@ -1274,14 +1274,15 @@ xfs_dax_fault( &xfs_read_iomap_ops); } #else -static int +static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, enum page_entry_size pe_size, bool write_fault, pfn_t *pfn) { - return 0; + ASSERT(0); + return VM_FAULT_SIGBUS; } #endif diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2bbe7916a998..eae7427062cf 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -596,7 +596,7 @@ xfs_iget_cache_miss( */ if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { - VFS_I(ip)->i_generation = prandom_u32(); + VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 28493c8e9bb2..aa303be11576 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -835,9 +835,8 @@ xfs_init_new_inode( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). 
*/ - if (irix_sgid_inherit && - (inode->i_mode & S_ISGID) && - !in_group_p(i_gid_into_mnt(mnt_userns, inode))) + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) inode->i_mode &= ~S_ISGID; ip->i_disk_size = 0; @@ -2819,7 +2818,7 @@ retry: * Lock all the participating inodes. Depending upon whether * the target_name exists in the target directory, and * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. + * directory, we can lock from 2 to 5 inodes. */ xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); @@ -3119,7 +3118,7 @@ xfs_iflush( if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, + "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto flush_out; } @@ -3129,7 +3128,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad regular inode %Lu, ptr "PTR_FMT, + "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3140,7 +3139,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad directory inode %Lu, ptr "PTR_FMT, + "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3158,7 +3157,7 @@ xfs_iflush( if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, + "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); goto flush_out; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6e19ece916bf..ca2941ab6cbc 100644 --- a/fs/xfs/xfs_inode_item.c +++ 
b/fs/xfs/xfs_inode_item.c @@ -550,7 +550,7 @@ xfs_inode_item_push( if (!bp || (ip->i_flags & XFS_ISTALE)) { /* - * Inode item/buffer is being being aborted due to cluster + * Inode item/buffer is being aborted due to cluster * buffer deletion. Trigger a log force to have that operation * completed and items removed from the AIL before the next push * attempt. diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index d28ffaebd067..0e5dba2343ea 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -321,7 +321,7 @@ xlog_recover_inode_commit_pass2( */ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, - "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld", __func__, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -329,7 +329,7 @@ xlog_recover_inode_commit_pass2( ldip = item->ri_buf[1].i_addr; if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", __func__, item, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f51c60d7e205..2e10e1c66ad6 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -167,7 +167,7 @@ xfs_generic_create( struct dentry *dentry, umode_t mode, dev_t rdev, - bool tmpfile) /* unnamed file */ + struct file *tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -234,7 +234,7 @@ xfs_generic_create( * d_tmpfile can immediately set it back to zero. 
*/ set_nlink(inode, 1); - d_tmpfile(dentry, inode); + d_tmpfile(tmpfile, inode); } else d_instantiate(dentry, inode); @@ -261,7 +261,7 @@ xfs_vn_mknod( umode_t mode, dev_t rdev) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL); } STATIC int @@ -272,7 +272,7 @@ xfs_vn_create( umode_t mode, bool flags) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL); } STATIC int @@ -283,7 +283,7 @@ xfs_vn_mkdir( umode_t mode) { return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, - false); + NULL); } STATIC struct dentry * @@ -558,6 +558,8 @@ xfs_vn_getattr( struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); trace_xfs_getattr(ip); @@ -568,8 +570,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; @@ -1090,10 +1092,12 @@ STATIC int xfs_vn_tmpfile( struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, + struct file *file, umode_t mode) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true); + int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file); + + return finish_open_simple(file, err); } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index cb5fc68c9ea0..e570dcb5df8d 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -13,7 +13,6 @@ extern const 
struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); int xfs_vn_setattr_size(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *vap); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 36312b00b164..a1c2bcf65d37 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,6 +66,8 @@ xfs_bulkstat_one_int( struct xfs_bulkstat *buf = bc->buf; xfs_extnum_t nextents; int error = -EINVAL; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (xfs_internal_inum(mp, ino)) goto out_advance; @@ -81,14 +83,16 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* xfs_iget returns the following without needing * further change. */ buf->bs_projectid = ip->i_projid; buf->bs_ino = ino; - buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode)); - buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode)); + buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid)); + buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid)); buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 386b0307aed8..f02a0dd522b3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -226,12 +226,12 @@ xlog_ticket_reservation( if (head == &log->l_write_head) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); return tic->t_unit_res; - } else { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - return tic->t_unit_res * tic->t_cnt; - else - return tic->t_unit_res; } + + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + + return tic->t_unit_res; } STATIC bool @@ -3544,7 +3544,7 @@ xlog_ticket_alloc( tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = 
prandom_u32(); + tic->t_tid = get_random_u32(); if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 17e923b9c5fa..322eb2ee6c55 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2552,6 +2552,8 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { + const struct xfs_item_ops *ops; + if (!xlog_item_is_intent(lip)) break; @@ -2567,13 +2569,17 @@ xlog_recover_process_intents( * deferred ops, you /must/ attach them to the capture list in * the recover routine or else those subsequent intents will be * replayed in the wrong order! + * + * The recovery function can free the log item, so we must not + * access lip after it returns. */ spin_unlock(&ailp->ail_lock); - error = lip->li_ops->iop_recover(lip, &capture_list); + ops = lip->li_ops; + error = ops->iop_recover(lip, &capture_list); spin_lock(&ailp->ail_lock); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, - lip->li_ops->iop_recover); + ops->iop_recover); break; } } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f10c88cee116..e8bb3c2e847e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -300,26 +300,28 @@ xfs_validate_new_dalign( "alignment check failed: sunit/swidth vs. blocksize(%d)", mp->m_sb.sb_blocksize); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - mp->m_sb.sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, mp->m_sb.sb_blocksize); - return -EINVAL; - } } + /* + * Convert the stripe unit and width to FSBs. 
+ */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); + return -EINVAL; + } + + if (!mp->m_dalign) { + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); + return -EINVAL; + } + + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 5b1f9a24ed59..c4078d0ec108 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -23,17 +23,18 @@ #include <linux/mm.h> #include <linux/dax.h> -struct failure_info { +struct xfs_failure_info { xfs_agblock_t startblock; xfs_extlen_t blockcount; int mf_flags; + bool want_shutdown; }; static pgoff_t xfs_failure_pgoff( struct xfs_mount *mp, const struct xfs_rmap_irec *rec, - const struct failure_info *notify) + const struct xfs_failure_info *notify) { loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); @@ -47,7 +48,7 @@ static unsigned long xfs_failure_pgcnt( struct xfs_mount *mp, const struct xfs_rmap_irec *rec, - const struct failure_info *notify) + const struct xfs_failure_info *notify) { xfs_agblock_t end_rec; xfs_agblock_t end_notify; @@ -71,13 +72,13 @@ xfs_dax_failure_fn( { struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; - struct failure_info *notify = data; + struct xfs_failure_info *notify = data; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); - return -EFSCORRUPTED; + notify->want_shutdown = true; + return 0; } /* Get files that incore, filter out others that are not in use. 
*/ @@ -86,8 +87,10 @@ xfs_dax_failure_fn( /* Continue the rmap query if the inode isn't incore */ if (error == -ENODATA) return 0; - if (error) - return error; + if (error) { + notify->want_shutdown = true; + return 0; + } error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, xfs_failure_pgoff(mp, rec, notify), @@ -104,6 +107,7 @@ xfs_dax_notify_ddev_failure( xfs_daddr_t bblen, int mf_flags) { + struct xfs_failure_info notify = { .mf_flags = mf_flags }; struct xfs_trans *tp = NULL; struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; @@ -120,7 +124,6 @@ xfs_dax_notify_ddev_failure( for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; - struct failure_info notify; struct xfs_agf *agf; xfs_agblock_t agend; struct xfs_perag *pag; @@ -161,6 +164,11 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); + if (error || notify.want_shutdown) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + if (!error) + error = -EFSCORRUPTED; + } return error; } diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 758702b9495f..9737b5a9f405 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -118,10 +118,10 @@ xfs_check_ondisk_structs(void) /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); @@ -134,6 +134,21 @@ 
xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40); XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + + XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); /* * The v5 superblock format extended several v4 header structures with diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7e97bf19793d..858e3e9eb4a8 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -523,7 +523,9 @@ xfs_cui_item_recover( type = refc_type; break; default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); error = -EFSCORRUPTED; goto abort_error; } @@ -536,7 +538,8 @@ xfs_cui_item_recover( &new_fsb, &new_len, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - refc, sizeof(*refc)); + &cuip->cui_format, + sizeof(cuip->cui_format)); if (error) goto abort_error; @@ -622,28 +625,18 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_relog = xfs_cui_item_relog, }; -/* - * Copy an CUI format buffer from the given buf, and into the destination - 
* CUI format structure. The CUI/CUD items were designed not to need any - * special alignment handling. - */ -static int +static inline void xfs_cui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_cui_log_format *dst_cui_fmt) + struct xfs_cui_log_format *dst, + const struct xfs_cui_log_format *src) { - struct xfs_cui_log_format *src_cui_fmt; - uint len; + unsigned int i; - src_cui_fmt = buf->i_addr; - len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents)); - if (buf->i_len == len) { - memcpy(dst_cui_fmt, src_cui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->cui_nextents; i++) + memcpy(&dst->cui_extents[i], &src->cui_extents[i], + sizeof(struct xfs_phys_extent)); } /* @@ -660,19 +653,28 @@ xlog_recover_cui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_cui_log_item *cuip; struct xfs_cui_log_format *cui_formatp; + size_t len; cui_formatp = item->ri_buf[0].i_addr; - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); - error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); - if (error) { - xfs_cui_item_free(cuip); - return error; + if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -706,7 +708,8 @@ 
xlog_recover_cud_commit_pass2( cud_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 251f20ddd368..93bdd25680bc 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -200,7 +200,9 @@ xfs_reflink_trim_around_shared( if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; - } else if (fbno == agbno) { + } + + if (fbno == agbno) { /* * The start of this extent is shared. Truncate the * mapping at the end of the shared region so that a @@ -210,16 +212,16 @@ xfs_reflink_trim_around_shared( irec->br_blockcount = flen; *shared = true; return 0; - } else { - /* - * There's a shared extent midway through this extent. - * Truncate the mapping at the start of the shared - * extent so that a subsequent iteration starts at the - * start of the shared region. - */ - irec->br_blockcount = fbno - agbno; - return 0; } + + /* + * There's a shared extent midway through this extent. + * Truncate the mapping at the start of the shared + * extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = fbno - agbno; + return 0; } int diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index fef92e02f3bb..534504ede1a3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -155,31 +155,6 @@ xfs_rui_init( return ruip; } -/* - * Copy an RUI format buffer from the given buf, and into the destination - * RUI format structure. The RUI/RUD items were designed not to need any - * special alignment handling. 
- */ -STATIC int -xfs_rui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_rui_log_format *dst_rui_fmt) -{ - struct xfs_rui_log_format *src_rui_fmt; - uint len; - - src_rui_fmt = buf->i_addr; - len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); - - if (buf->i_len != len) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; - } - - memcpy(dst_rui_fmt, src_rui_fmt, len); - return 0; -} - static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); @@ -582,7 +557,9 @@ xfs_rui_item_recover( type = XFS_RMAP_FREE; break; default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); error = -EFSCORRUPTED; goto abort_error; } @@ -652,6 +629,20 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_relog = xfs_rui_item_relog, }; +static inline void +xfs_rui_copy_format( + struct xfs_rui_log_format *dst, + const struct xfs_rui_log_format *src) +{ + unsigned int i; + + memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents)); + + for (i = 0; i < src->rui_nextents; i++) + memcpy(&dst->rui_extents[i], &src->rui_extents[i], + sizeof(struct xfs_map_extent)); +} + /* * This routine is called to create an in-core extent rmap update * item from the rui format structure which was logged on disk. 
@@ -666,19 +657,28 @@ xlog_recover_rui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_rui_log_item *ruip; struct xfs_rui_log_format *rui_formatp; + size_t len; rui_formatp = item->ri_buf[0].i_addr; - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); - error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); - if (error) { - xfs_rui_item_free(ruip); - return error; + if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -711,7 +711,11 @@ xlog_recover_rud_commit_pass2( struct xfs_rud_log_format *rud_formatp; rud_formatp = item->ri_buf[0].i_addr; - ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + rud_formatp, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id); return 0; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 20e0534a772c..90a77cd3ebad 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -74,7 +74,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } - len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc 
%llu %llu %llu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", defer_relog); @@ -125,7 +125,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) { int j; - seq_printf(m, "qm"); + seq_puts(m, "qm"); for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f029c6702dda..ee4b429a2f2c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2028,18 +2028,14 @@ xfs_init_caches(void) goto out_destroy_trans_cache; xfs_efd_cache = kmem_cache_create("xfs_efd_item", - (sizeof(struct xfs_efd_log_item) + - (XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_extent)), - 0, 0, NULL); + xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS), + 0, 0, NULL); if (!xfs_efd_cache) goto out_destroy_buf_item_cache; xfs_efi_cache = kmem_cache_create("xfs_efi_item", - (sizeof(struct xfs_efi_log_item) + - (XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_extent)), - 0, 0, NULL); + xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS), + 0, 0, NULL); if (!xfs_efi_cache) goto out_destroy_efd_cache; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 43585850f154..513095e353a5 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -33,10 +33,15 @@ xfs_sysfs_init( const char *name) { struct kobject *parent; + int err; parent = parent_kobj ? 
&parent_kobj->kobject : NULL; init_completion(&kobj->complete); - return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + if (err) + kobject_put(&kobj->kobject); + + return err; } static inline void diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f9057af6e0c8..372d871bccc5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -799,6 +799,9 @@ TRACE_DEFINE_ENUM(PE_SIZE_PTE); TRACE_DEFINE_ENUM(PE_SIZE_PMD); TRACE_DEFINE_ENUM(PE_SIZE_PUD); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), @@ -1170,7 +1173,7 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class, __entry->ino_res_used = qtrx->qt_ino_res_used; __entry->icount_delta = qtrx->qt_icount_delta; ), - TP_printk("dev %d:%d dquot id 0x%x type %s flags %s" + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s " "blk_res %llu bcount_delta %lld delbcnt_delta %lld " "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " "ino_res %llu ino_res_used %llu icount_delta %lld", @@ -1602,7 +1605,7 @@ TRACE_EVENT(xfs_bunmap, __entry->caller_ip = caller_ip; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx" + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx " "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -2925,6 +2928,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) @@ -2932,13 +2936,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->domain = irec->rc_domain; 
__entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) @@ -2958,6 +2964,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) @@ -2966,14 +2973,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount, @@ -2994,9 +3003,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) @@ -3004,20 +3015,24 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_fast_assign( __entry->dev = 
mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount) @@ -3038,9 +3053,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) @@ -3049,21 +3066,25 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - 
"agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, @@ -3086,12 +3107,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) + __field(enum xfs_refc_domain, i3_domain) __field(xfs_agblock_t, i3_startblock) __field(xfs_extlen_t, i3_blockcount) __field(xfs_nlink_t, i3_refcount) @@ -3099,27 +3123,33 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; + __entry->i3_domain = i3->rc_domain; __entry->i3_startblock = i3->rc_startblock; __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 
0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, + __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i3_startblock, __entry->i3_blockcount, __entry->i3_refcount) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d3a97a028560..f51df7d94ef7 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -602,9 +602,9 @@ xfsaild( while (1) { if (tout && tout <= 20) - set_current_state(TASK_KILLABLE); + set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); /* * Check kthread_should_stop() after we set the task state to @@ -653,14 +653,14 @@ xfsaild( ailp->ail_target == ailp->ail_target_prev && list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); - freezable_schedule(); + schedule(); tout = 0; continue; } spin_unlock(&ailp->ail_lock); if (tout) - freezable_schedule_timeout(msecs_to_jiffies(tout)); + schedule_timeout(msecs_to_jiffies(tout)); __set_current_state(TASK_RUNNING); @@ -730,11 +730,10 @@ void xfs_ail_push_all_sync( struct xfs_ail *ailp) { - struct xfs_log_item *lip; DEFINE_WAIT(wait); spin_lock(&ailp->ail_lock); - while ((lip = xfs_ail_max(ailp)) != NULL) { + while (xfs_ail_max(ailp) != NULL) { prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); wake_up_process(ailp->ail_task); spin_unlock(&ailp->ail_lock); |