diff options
Diffstat (limited to 'fs')
66 files changed, 576 insertions, 278 deletions
diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 60685ec76d98..2e612834329a 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -105,6 +105,7 @@ struct affs_sb_info { int work_queued; /* non-zero delayed work is queued */ struct delayed_work sb_work; /* superblock flush delayed work */ spinlock_t work_lock; /* protects sb_work and work_queued */ + struct rcu_head rcu; }; #define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */ diff --git a/fs/affs/super.c b/fs/affs/super.c index 58b391446ae1..b56a95cf414a 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -640,7 +640,7 @@ static void affs_kill_sb(struct super_block *sb) affs_brelse(sbi->s_root_bh); kfree(sbi->s_prefix); mutex_destroy(&sbi->s_bmlock); - kfree(sbi); + kfree_rcu(sbi, rcu); } } diff --git a/fs/afs/file.c b/fs/afs/file.c index 3d33b221d9ca..ef2cc8f565d2 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -417,13 +417,17 @@ static void afs_add_open_mmap(struct afs_vnode *vnode) static void afs_drop_open_mmap(struct afs_vnode *vnode) { - if (!atomic_dec_and_test(&vnode->cb_nr_mmap)) + if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1)) return; down_write(&vnode->volume->open_mmaps_lock); - if (atomic_read(&vnode->cb_nr_mmap) == 0) + read_seqlock_excl(&vnode->cb_lock); + // the only place where ->cb_nr_mmap may hit 0 + // see __afs_break_callback() for the other side... + if (atomic_dec_and_test(&vnode->cb_nr_mmap)) list_del_init(&vnode->cb_mmap_link); + read_sequnlock_excl(&vnode->cb_lock); up_write(&vnode->volume->open_mmaps_lock); flush_work(&vnode->cb_work); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9c03fcf7ffaa..6ce5a612937c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -321,8 +321,7 @@ struct afs_net { struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ - struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */ - struct hlist_head fs_addresses6; /* afs_server (by lowest IPv6 addr) */ + struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ seqlock_t fs_addr_lock; /* For fs_addresses[46] */ struct work_struct fs_manager; @@ -561,8 +560,7 @@ struct afs_server { struct afs_server __rcu *uuid_next; /* Next server with same UUID */ struct afs_server *uuid_prev; /* Previous server with same UUID */ struct list_head probe_link; /* Link in net->fs_probe_list */ - struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ - struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ + struct hlist_node addr_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ struct afs_server *gc_next; /* Next server in manager's list */ diff --git a/fs/afs/main.c b/fs/afs/main.c index 1b3bd21c168a..a14f6013e316 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -90,8 +90,7 @@ static int __net_init afs_net_init(struct net *net_ns) INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); - INIT_HLIST_HEAD(&net->fs_addresses4); - INIT_HLIST_HEAD(&net->fs_addresses6); + INIT_HLIST_HEAD(&net->fs_addresses); seqlock_init(&net->fs_addr_lock); INIT_WORK(&net->fs_manager, afs_manage_servers); diff --git a/fs/afs/server.c b/fs/afs/server.c index e169121f603e..038f9d0ae3af 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -38,7 +38,7 @@ struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { + hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) { estate = rcu_dereference(server->endpoint_state); alist = estate->addresses; for (i = 0; i < alist->nr_addrs; i++) @@ -177,10 +177,8 @@ added_dup: * bit, but anything we might want to do gets messy and memory * intensive. */ - if (alist->nr_ipv4 > 0) - hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4); - if (alist->nr_addrs > alist->nr_ipv4) - hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6); + if (alist->nr_addrs > 0) + hlist_add_head_rcu(&server->addr_link, &net->fs_addresses); write_sequnlock(&net->fs_addr_lock); @@ -511,10 +509,8 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) list_del(&server->probe_link); hlist_del_rcu(&server->proc_link); - if (!hlist_unhashed(&server->addr4_link)) - hlist_del_rcu(&server->addr4_link); - if (!hlist_unhashed(&server->addr6_link)) - hlist_del_rcu(&server->addr6_link); + if (!hlist_unhashed(&server->addr_link)) + hlist_del_rcu(&server->addr_link); } write_sequnlock(&net->fs_lock); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 020ecd45e476..af3a3f57c1b3 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -353,7 +353,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) { struct afs_server_list *new, *old, *discard; struct afs_vldb_entry *vldb; - char idbuf[16]; + char idbuf[24]; int ret, idsz; _enter(""); @@ -361,7 +361,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) /* We look up an ID by passing it as a decimal string in the * operation's name parameter. */ - idsz = sprintf(idbuf, "%llu", volume->vid); + idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid); vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); if (IS_ERR(vldb)) { @@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) struct kioctx *ctx = req->ki_ctx; unsigned long flags; + /* + * kiocb didn't come from aio or is neither a read nor a write, hence + * ignore it. + */ + if (!(iocb->ki_flags & IOCB_AIO_RW)) + return; + if (WARN_ON_ONCE(!list_empty(&req->ki_list))) return; @@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; - req->ki_flags = req->ki_filp->f_iocb_flags; + req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b80c6c9efd8c..69d0d60d50e3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1249,6 +1249,18 @@ static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) return stdio; } +static inline unsigned metadata_replicas_required(struct bch_fs *c) +{ + return min(c->opts.metadata_replicas, + c->opts.metadata_replicas_required); +} + +static inline unsigned data_replicas_required(struct bch_fs *c) +{ + return min(c->opts.data_replicas, + c->opts.data_replicas_required); +} + #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 17a5938aa71a..4530b14ff2c3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -280,7 +280,8 @@ retry: writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, - c->opts.metadata_replicas_required, + min(res->nr_replicas, + c->opts.metadata_replicas_required), watermark, 0, cl, &wp); if (unlikely(ret)) return ERR_PTR(ret); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ec419b8e2c43..77ae65542db9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, bch2_subvol_is_ro(c, inode->ei_subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) - return ret; + return bch2_err_class(ret); ihold(&inode->v); d_instantiate(dentry, &inode->v); @@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - return bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: __bch2_unlink(vdir, dentry, false); + return bch2_err_class(ret); } static int bch2_symlink(struct mnt_idmap *idmap, @@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, return 0; err: iput(&inode->v); - return ret; + return bch2_err_class(ret); } static int bch2_mkdir(struct mnt_idmap *idmap, @@ -641,7 +642,7 @@ err: src_inode, dst_inode); - return ret; + return bch2_err_class(ret); } static void bch2_setattr_copy(struct mnt_idmap *idmap, diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index ef3a53f9045a..2c098ac017b3 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1564,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bfd6585e746d..47805193f18c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1478,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_need = min_t(unsigned, replicas_want, + READ_ONCE(c->opts.metadata_replicas_required)); rcu_read_lock(); retry: @@ -1526,7 +1528,7 @@ done: BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; + return replicas >= replicas_need ? 0 : -EROFS; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 820d25e19e5f..2cf626315652 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -205,7 +205,7 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; - if (nr_online < c->opts.metadata_replicas_required) { + if (nr_online < metadata_replicas_required(c)) { ret = JOURNAL_ERR_insufficient_devices; goto out; } diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index accf246c3233..b27d22925929 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) va_copy(args2, args); len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + va_end(args2); } while (len + 1 >= printbuf_remaining(out) && !bch2_printbuf_make_room(out, len + 1)); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9127d0e3ca2f..21e13bb4335b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -577,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v) static bool check_version_upgrade(struct bch_fs *c) { - unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); unsigned latest_version = bcachefs_metadata_version_current; + unsigned latest_compatible = min(latest_version, + bch2_latest_compatible_version(c->sb.version)); unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; @@ -597,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c) new_version = latest_version; break; case BCH_VERSION_UPGRADE_none: - new_version = old_version; + new_version = min(old_version, latest_version); break; } } @@ -774,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->opts.nochanges && c->opts.norecovery)) { + if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); bool write_sb = false; @@ -804,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (bch2_check_version_downgrade(c)) { struct printbuf buf = PRINTBUF; - prt_str(&buf, "Version downgrade required:\n"); + prt_str(&buf, "Version downgrade required:"); __le64 passes = ext->recovery_passes_required[0]; bch2_sb_set_downgrade(c, @@ -812,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c) BCH_VERSION_MINOR(c->sb.version)); passes = ext->recovery_passes_required[0] & ~passes; if (passes) { - prt_str(&buf, " running recovery passes: "); + prt_str(&buf, "\n running recovery passes: "); prt_bitflags(&buf, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index a45354d2acde..eff5ce18c69c 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -421,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca) m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); - m->errors_reset_time = ktime_get_real_seconds(); + m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d60c7d27a047..36988add581f 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -717,7 +717,7 @@ retry: if (IS_ERR(sb->bdev_handle)) { ret = PTR_ERR(sb->bdev_handle); - goto out; + goto err; } sb->bdev = sb->bdev_handle->bdev; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b9911402b175..6b23e11825e6 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1428,10 +1428,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ? c->opts.metadata_replicas - : c->opts.metadata_replicas_required, + : metadata_replicas_required(c), !(flags & BCH_FORCE_IF_DATA_DEGRADED) ? c->opts.data_replicas - : c->opts.data_replicas_required); + : data_replicas_required(c)); return nr_rw >= required; case BCH_MEMBER_STATE_failed: diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a9be9ac99222..378d9103a207 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1455,6 +1455,7 @@ out: */ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) { + LIST_HEAD(retry_list); struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; @@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { + u64 used; int trimming; block_group = list_first_entry(&fs_info->unused_bgs, @@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) goto next; } + spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->reserved || block_group->pinned || - block_group->used || block_group->ro || + if (btrfs_is_block_group_used(block_group) || block_group->ro || list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have @@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } + + /* + * The block group may be unused but there may be space reserved + * accounting with the existence of that block group, that is, + * space_info->bytes_may_use was incremented by a task but no + * space was yet allocated from the block group by the task. + * That space may or may not be allocated, as we are generally + * pessimistic about space reservation for metadata as well as + * for data when using compression (as we reserve space based on + * the worst case, when data can't be compressed, and before + * actually attempting compression, before starting writeback). + * + * So check if the total space of the space_info minus the size + * of this block group is less than the used space of the + * space_info - if that's the case, then it means we have tasks + * that might be relying on the block group in order to allocate + * extents, and add back the block group to the unused list when + * we finish, so that we retry later in case no tasks ended up + * needing to allocate extents from the block group. + */ + used = btrfs_space_info_used(space_info, true); + if (space_info->total_bytes - block_group->length < used) { + /* + * Add a reference for the list, compensate for the ref + * drop under the "next" label for the + * fs_info->unused_bgs list. + */ + btrfs_get_block_group(block_group); + list_add_tail(&block_group->bg_list, &retry_list); + + trace_btrfs_skip_unused_block_group(block_group); + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); /* We don't want to force the issue, only flip if it's ok. */ ret = inc_block_group_ro(block_group, 0); @@ -1650,12 +1691,16 @@ next: btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } + list_splice_tail(&retry_list, &fs_info->unused_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); return; flip_async: btrfs_end_transaction(trans); + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->unused_bgs); + spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_put_block_group(block_group); btrfs_discard_punt_unused_bgs_list(fs_info); @@ -2684,6 +2729,37 @@ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + + /* + * If the block group is still unused, add it to the list of + * unused block groups. The block group may have been created in + * order to satisfy a space reservation, in which case the + * extent allocation only happens later. But often we don't + * actually need to allocate space that we previously reserved, + * so the block group may become unused for a long time. For + * example for metadata we generally reserve space for a worst + * possible scenario, but then don't end up allocating all that + * space or none at all (due to no need to COW, extent buffers + * were already COWed in the current transaction and still + * unwritten, tree heights lower than the maximum possible + * height, etc). For data we generally reserve the axact amount + * of space we are going to allocate later, the exception is + * when using compression, as we must reserve space based on the + * uncompressed data size, because the compression is only done + * when writeback triggered and we don't know how much space we + * are actually going to need, so we reserve the uncompressed + * size because the data may be uncompressible in the worst case. + */ + if (ret == 0) { + bool used; + + spin_lock(&block_group->lock); + used = btrfs_is_block_group_used(block_group); + spin_unlock(&block_group->lock); + + if (!used) + btrfs_mark_bg_unused(block_group); + } } btrfs_trans_release_chunk_metadata(trans); } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c4a1f01cc1c2..962b11983901 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) return (block_group->start + block_group->length); } +static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) +{ + lockdep_assert_held(&bg->lock); + + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); +} + static inline bool btrfs_is_block_group_data_only( struct btrfs_block_group *block_group) { diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index c276b136ab63..5b0b64571418 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1046,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (range_len >= extent_thresh) + if (em->len >= extent_thresh) goto next; /* diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 2833e8ef4c09..acf9f4b6c044 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 reserve_size = 0; u64 qgroup_rsv_size = 0; - u64 csum_leaves; unsigned outstanding_extents; lockdep_assert_held(&inode->lock); @@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, outstanding_extents); reserve_size += btrfs_calc_metadata_size(fs_info, 1); } - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, - inode->csum_bytes); - reserve_size += btrfs_calc_insert_metadata_size(fs_info, - csum_leaves); + if (!(inode->flags & BTRFS_INODE_NODATASUM)) { + u64 csum_leaves; + + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); + reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves); + } /* * For qgroup rsv, the calculation is very simple: * account one nodesize for each outstanding extent @@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); } -static void calc_inode_reservations(struct btrfs_fs_info *fs_info, +static void calc_inode_reservations(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 nr_extents = count_max_extents(fs_info, num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + u64 csum_leaves; u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); + if (inode->flags & BTRFS_INODE_NODATASUM) + csum_leaves = 0; + else + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, nr_extents + csum_leaves); @@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. */ - calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + calc_inode_reservations(inode, num_bytes, disk_num_bytes, &meta_reserve, &qgroup_reserve); ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, noflush); @@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, nr_extents = count_max_extents(fs_info, num_bytes); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += disk_num_bytes; + if (!(inode->flags & BTRFS_INODE_NODATASUM)) + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - inode->csum_bytes -= num_bytes; + if (!(inode->flags & BTRFS_INODE_NODATASUM)) + inode->csum_bytes -= num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cfd2967f04a2..3e19a2475ab3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2689,16 +2689,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode, * it beyond i_size. */ while (cur_offset < end && cur_offset < i_size) { + struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; u64 prealloc_start; + u64 lockstart; + u64 lockend; u64 prealloc_len = 0; bool delalloc; + lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize); + lockend = round_up(end, inode->root->fs_info->sectorsize); + + /* + * We are only locking for the delalloc range because that's the + * only thing that can change here. With fiemap we have a lock + * on the inode, so no buffered or direct writes can happen. + * + * However mmaps and normal page writeback will cause this to + * change arbitrarily. We have to lock the extent lock here to + * make sure that nobody messes with the tree while we're doing + * btrfs_find_delalloc_in_range. + */ + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, delalloc_cached_state, &delalloc_start, &delalloc_end); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) break; @@ -2866,15 +2884,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { const u64 ino = btrfs_ino(inode); - struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end; u64 prev_extent_end; - u64 lockstart; - u64 lockend; + u64 range_start; + u64 range_end; + const u64 sectorsize = inode->root->fs_info->sectorsize; bool stopped = false; int ret; @@ -2885,12 +2903,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - lockstart = round_down(start, inode->root->fs_info->sectorsize); - lockend = round_up(start + len, inode->root->fs_info->sectorsize); - prev_extent_end = lockstart; + range_start = round_down(start, sectorsize); + range_end = round_up(start + len, sectorsize); + prev_extent_end = range_start; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) @@ -2898,7 +2915,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, btrfs_release_path(path); path->reada = READA_FORWARD; - ret = fiemap_search_slot(inode, path, lockstart); + ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { goto out_unlock; } else if (ret > 0) { @@ -2910,7 +2927,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto check_eof_delalloc; } - while (prev_extent_end < lockend) { + while (prev_extent_end < range_end) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *ei; struct btrfs_key key; @@ -2933,19 +2950,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, * The first iteration can leave us at an extent item that ends * before our range's start. Move to the next item. */ - if (extent_end <= lockstart) + if (extent_end <= range_start) goto next_item; backref_ctx->curr_leaf_bytenr = leaf->start; /* We have in implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { - const u64 range_end = min(key.offset, lockend) - 1; + const u64 hole_end = min(key.offset, range_end) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, - prev_extent_end, range_end); + prev_extent_end, hole_end); if (ret < 0) { goto out_unlock; } else if (ret > 0) { @@ -2955,7 +2972,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } /* We've reached the end of the fiemap range, stop. */ - if (key.offset >= lockend) { + if (key.offset >= range_end) { stopped = true; break; } @@ -3049,29 +3066,41 @@ check_eof_delalloc: btrfs_free_path(path); path = NULL; - if (!stopped && prev_extent_end < lockend) { + if (!stopped && prev_extent_end < range_end) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, - 0, 0, 0, prev_extent_end, lockend - 1); + 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) goto out_unlock; - prev_extent_end = lockend; + prev_extent_end = range_end; } if (cache.cached && cache.offset + cache.len >= last_extent_end) { const u64 i_size = i_size_read(&inode->vfs_inode); if (prev_extent_end < i_size) { + struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; + u64 lockstart; + u64 lockend; bool delalloc; + lockstart = round_down(prev_extent_end, sectorsize); + lockend = round_up(i_size, sectorsize); + + /* + * See the comment in fiemap_process_hole as to why + * we're doing the locking here. + */ + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, &delalloc_cached_state, &delalloc_start, &delalloc_end); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) cache.flags |= FIEMAP_EXTENT_LAST; } else { @@ -3082,7 +3111,6 @@ check_eof_delalloc: ret = emit_last_fiemap_cache(fieinfo, &cache); out_unlock: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1eb93d3962aa..f88e0ca8331d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3184,8 +3184,23 @@ out: unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* Drop extent maps for the part of the extent we didn't write. */ - btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + /* + * Drop extent maps for the part of the extent we didn't write. + * + * We have an exception here for the free_space_inode, this is + * because when we do btrfs_get_extent() on the free space inode + * we will search the commit root. If this is a new block group + * we won't find anything, and we will trip over the assert in + * writepage where we do ASSERT(em->block_start != + * EXTENT_MAP_HOLE). + * + * Theoretically we could also skip this for any NOCOW extent as + * we don't mess with the extent map tree in the NOCOW case, but + * for now simply skip this if we are the free space inode. + */ + if (!btrfs_is_free_space_inode(inode)) + btrfs_drop_extent_map_range(inode, unwritten_start, + end, false); /* * If the ordered extent had an IOERR or something else went @@ -10273,6 +10288,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) return -EINVAL; + /* + * Compressed extents should always have checksums, so error out if we + * have a NOCOW file or inode was created while mounted with NODATASUM. + */ + if (inode->flags & BTRFS_INODE_NODATASUM) + return -EINVAL; + orig_count = iov_iter_count(from); /* The extent size must be sane. */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5b3333ceef04..c52807d97efa 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 *delayed_refs_bytes) { - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; - u64 extra_delayed_refs_bytes = 0; - u64 bytes; + u64 bytes = num_bytes + *delayed_refs_bytes; int ret; /* - * If there's a gap between the size of the delayed refs reserve and - * its reserved space, than some tasks have added delayed refs or bumped - * its size otherwise (due to block group creation or removal, or block - * group item update). Also try to allocate that gap in order to prevent - * using (and possibly abusing) the global reserve when committing the - * transaction. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL && - !btrfs_block_rsv_full(delayed_refs_rsv)) { - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) - extra_delayed_refs_bytes = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - } - - bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes; - - /* * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. */ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); - if (ret == 0) { - if (extra_delayed_refs_bytes > 0) - btrfs_migrate_to_delayed_refs_rsv(fs_info, - extra_delayed_refs_bytes); - return 0; - } - - if (extra_delayed_refs_bytes > 0) { - bytes -= extra_delayed_refs_bytes; - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); - if (ret == 0) - return 0; - } /* * If we are an emergency flush, which can steal from the global block * reserve, then attempt to not reserve space for the delayed refs, as * we will consume space for them from the global block reserve. */ - if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 168af9d000d1..3a5d69ff25fc 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1670,6 +1670,7 @@ out: } bitmap_free(active); kfree(zone_info); + btrfs_free_chunk_map(map); return ret; } diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 7077f72e6f47..f449f7340aad 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -168,6 +168,8 @@ error_unsupported: dput(root); error_open_root: cachefiles_end_secure(cache, saved_cred); + put_cred(cache->cache_cred); + cache->cache_cred = NULL; error_getsec: fscache_relinquish_cache(cache_cookie); cache->cache = NULL; diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 3f24905f4066..6465e2574230 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) cachefiles_put_directory(cache->graveyard); cachefiles_put_directory(cache->store); mntput(cache->mnt); + put_cred(cache->cache_cred); kfree(cache->rootdirname); kfree(cache->secctx); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ad1f46c66fbf..7fb4aae97412 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2156,6 +2156,30 @@ retry: ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); + /* completed revocation? going down and there are no caps? */ + if (revoking) { + if ((revoking & cap_used) == 0) { + doutc(cl, "completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* + * If the "i_wrbuffer_ref" was increased by mmap or generic + * cache write just before the ceph_check_caps() is called, + * the Fb capability revoking will fail this time. Then we + * must wait for the BDI's delayed work to flush the dirty + * pages and to release the "i_wrbuffer_ref", which will cost + * at most 5 seconds. That means the MDS needs to wait at + * most 5 seconds to finished the Fb capability's revocation. + * + * Let's queue a writeback for it. + */ + if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && + (revoking & CEPH_CAP_FILE_BUFFER)) + queue_writeback = true; + } + if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ @@ -2183,30 +2207,6 @@ retry: } } - /* completed revocation? going down and there are no caps? */ - if (revoking) { - if ((revoking & cap_used) == 0) { - doutc(cl, "completed revocation of %s\n", - ceph_cap_string(cap->implemented & ~cap->issued)); - goto ack; - } - - /* - * If the "i_wrbuffer_ref" was increased by mmap or generic - * cache write just before the ceph_check_caps() is called, - * the Fb capability revoking will fail this time. Then we - * must wait for the BDI's delayed work to flush the dirty - * pages and to release the "i_wrbuffer_ref", which will cost - * at most 5 seconds. That means the MDS needs to wait at - * most 5 seconds to finished the Fb capability's revocation. - * - * Let's queue a writeback for it. - */ - if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && - (revoking & CEPH_CAP_FILE_BUFFER)) - queue_writeback = true; - } - /* want more caps from mds? */ if (want & ~cap->mds_wanted) { if (want & ~(cap->mds_wanted | cap->issued)) @@ -4772,7 +4772,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode) if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; - __cap_delay_requeue_front(mdsc, ci); + + doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, + ceph_vinop(inode)); + spin_lock(&mdsc->cap_unlink_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add_tail(&ci->i_cap_delay_list, + &mdsc->cap_unlink_delay_list); + spin_unlock(&mdsc->cap_unlink_delay_lock); + + /* + * Fire the work immediately, because the MDS maybe + * waiting for caps release. + */ + ceph_queue_cap_unlink_work(mdsc); } } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f71bb9c9569f..3ab9c268a8bb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2484,6 +2484,50 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) } } +void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc) +{ + struct ceph_client *cl = mdsc->fsc->client; + if (mdsc->stopping) + return; + + if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) { + doutc(cl, "caps unlink work queued\n"); + } else { + doutc(cl, "failed to queue caps unlink work\n"); + } +} + +static void ceph_cap_unlink_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_unlink_work); + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "begin\n"); + spin_lock(&mdsc->cap_unlink_delay_lock); + while (!list_empty(&mdsc->cap_unlink_delay_list)) { + struct ceph_inode_info *ci; + struct inode *inode; + + ci = list_first_entry(&mdsc->cap_unlink_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + list_del_init(&ci->i_cap_delay_list); + + inode = igrab(&ci->netfs.inode); + if (inode) { + spin_unlock(&mdsc->cap_unlink_delay_lock); + doutc(cl, "on %p %llx.%llx\n", inode, + ceph_vinop(inode)); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); + iput(inode); + spin_lock(&mdsc->cap_unlink_delay_lock); + } + } + spin_unlock(&mdsc->cap_unlink_delay_lock); + doutc(cl, "done\n"); +} + /* * requests */ @@ -5359,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->cap_delay_list); INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); + spin_lock_init(&mdsc->cap_unlink_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; @@ -5367,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); + INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work); err = ceph_metric_init(&mdsc->metric); if (err) goto err_mdsmap; @@ -5640,6 +5687,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); + cancel_work_sync(&mdsc->cap_unlink_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ doutc(cl, "done\n"); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 40560af38827..03f8ff00874f 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -462,6 +462,8 @@ struct ceph_mds_client { unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ spinlock_t cap_delay_lock; /* protects cap_delay_list */ + struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */ + spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */ struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; @@ -475,6 +477,8 @@ struct ceph_mds_client { struct work_struct cap_reclaim_work; atomic_t cap_reclaim_pending; + struct work_struct cap_unlink_work; + /* * Cap reservations * @@ -574,6 +578,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); +extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg); diff --git a/fs/dcache.c b/fs/dcache.c index b813528fb147..6ebccba33336 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3061,7 +3061,10 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; - dentry->d_lockref.count--; + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_lockref.count--; + } } return D_WALK_CONTINUE; } diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 5ff90026fd43..89a7c2453aae 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -381,11 +381,12 @@ static int erofs_fscache_init_domain(struct super_block *sb) goto out; if (!erofs_pseudo_mnt) { - erofs_pseudo_mnt = kern_mount(&erofs_fs_type); - if (IS_ERR(erofs_pseudo_mnt)) { - err = PTR_ERR(erofs_pseudo_mnt); + struct vfsmount *mnt = kern_mount(&erofs_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto out; } + erofs_pseudo_mnt = mnt; } domain->volume = sbi->volume; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 9474cd50da6d..361595433480 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -275,6 +275,7 @@ struct exfat_sb_info { spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; + struct rcu_head rcu; }; #define EXFAT_CACHE_VALID 0 diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 705710f93e2d..afdf13c34ff5 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb, unsigned int sect_size = sb->s_blocksize; unsigned int i, index = 0; u32 chksum = 0; - int ret; unsigned char skip = false; unsigned short *upcase_table; @@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb, if (!bh) { exfat_err(sb, "failed to read sector(0x%llx)", (unsigned long long)sector); - ret = -EIO; - goto free_table; + return -EIO; } sector++; for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) { @@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb, exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)", index, chksum, utbl_checksum); - ret = -EINVAL; -free_table: - exfat_free_upcase_table(sbi); - return ret; + return -EINVAL; } static int exfat_load_default_upcase_table(struct super_block *sb) { - int i, ret = -EIO; + int i; struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned char skip = false; unsigned short uni = 0, *upcase_table; @@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb) return 0; /* FATAL error: default upcase table has error */ - exfat_free_upcase_table(sbi); - return ret; + return -EIO; } int exfat_create_upcase_table(struct super_block *sb) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index d9d4fa91010b..fcb658267765 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb) exfat_free_bitmap(sbi); brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); - - unload_nls(sbi->nls_io); - exfat_free_upcase_table(sbi); } static int exfat_sync_fs(struct super_block *sb, int wait) @@ -600,7 +597,7 @@ static int __exfat_fill_super(struct super_block *sb) ret = exfat_load_bitmap(sb); if (ret) { exfat_err(sb, "failed to load alloc-bitmap"); - goto free_upcase_table; + goto free_bh; } ret = exfat_count_used_clusters(sb, &sbi->used_clusters); @@ -613,8 +610,6 @@ static int __exfat_fill_super(struct super_block *sb) free_alloc_bitmap: exfat_free_bitmap(sbi); -free_upcase_table: - exfat_free_upcase_table(sbi); free_bh: brelse(sbi->boot_bh); return ret; @@ -701,12 +696,10 @@ put_inode: sb->s_root = NULL; free_table: - exfat_free_upcase_table(sbi); exfat_free_bitmap(sbi); brelse(sbi->boot_bh); check_nls_io: - unload_nls(sbi->nls_io); return err; } @@ -771,13 +764,22 @@ static int exfat_init_fs_context(struct fs_context *fc) return 0; } +static void delayed_free(struct rcu_head *p) +{ + struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); + + unload_nls(sbi->nls_io); + exfat_free_upcase_table(sbi); + exfat_free_sbi(sbi); +} + static void exfat_kill_sb(struct super_block *sb) { struct exfat_sb_info *sbi = sb->s_fs_info; kill_block_super(sb); if (sbi) - exfat_free_sbi(sbi); + call_rcu(&sbi->rcu, delayed_free); } static struct file_system_type exfat_fs_type = { diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 75bf1f88843c..645240cc0229 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); - if (IS_ERR(bh)) - return ERR_CAST(bh); - if (!bh || !ext4_buffer_uptodate(bh)) + if (IS_ERR(bh) || !bh) return ERR_PTR(-ECHILD); + if (!ext4_buffer_uptodate(bh)) { + brelse(bh); + return ERR_PTR(-ECHILD); + } } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 91e89e68177e..b6cad106c37e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -474,8 +474,7 @@ err: static void cuse_fc_release(struct fuse_conn *fc) { - struct cuse_conn *cc = fc_to_cc(fc); - kfree_rcu(cc, fc.rcu); + kfree(fc_to_cc(fc)); } /** diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1df83eebda92..bcbe34488862 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -888,6 +888,7 @@ struct fuse_mount { /* Entry on fc->mounts */ struct list_head fc_entry; + struct rcu_head rcu; }; static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2a6d44f91729..516ea2979a90 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -930,6 +930,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, } EXPORT_SYMBOL_GPL(fuse_conn_init); +static void delayed_release(struct rcu_head *p) +{ + struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + + put_user_ns(fc->user_ns); + fc->release(fc); +} + void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { @@ -941,13 +949,12 @@ void fuse_conn_put(struct fuse_conn *fc) if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); - put_user_ns(fc->user_ns); bucket = rcu_dereference_protected(fc->curr_bucket, 1); if (bucket) { WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } - fc->release(fc); + call_rcu(&fc->rcu, delayed_release); } } EXPORT_SYMBOL_GPL(fuse_conn_put); @@ -1366,7 +1373,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init); void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); - kfree_rcu(fc, rcu); + kfree(fc); } EXPORT_SYMBOL_GPL(fuse_free_conn); @@ -1902,7 +1909,7 @@ static void fuse_sb_destroy(struct super_block *sb) void fuse_mount_destroy(struct fuse_mount *fm) { fuse_conn_put(fm->fc); - kfree(fm); + kfree_rcu(fm, rcu); } EXPORT_SYMBOL(fuse_mount_destroy); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 7ededcb720c1..012a3d003fbe 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -190,6 +190,7 @@ struct hfsplus_sb_info { int work_queued; /* non-zero delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ + struct rcu_head rcu; }; #define HFSPLUS_SB_WRITEBACKUP 0 diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1986b4f18a90..97920202790f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb) spin_unlock(&sbi->work_lock); } +static void delayed_free(struct rcu_head *p) +{ + struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu); + + unload_nls(sbi->nls); + kfree(sbi); +} + static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); @@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb) hfs_btree_close(sbi->ext_tree); kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); - unload_nls(sbi->nls); - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; + call_rcu(&sbi->rcu, delayed_free); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/namei.c b/fs/namei.c index 4e0de939fea1..9342fa6a38c2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1717,7 +1717,11 @@ static inline int may_lookup(struct mnt_idmap *idmap, { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (err != -ECHILD || !try_to_unlazy(nd)) + if (!err) // success, keep going + return 0; + if (!try_to_unlazy(nd)) + return -ECHILD; // redo it all non-lazy + if (err != -ECHILD) // hard error return err; } return inode_permission(idmap, nd->inode, MAY_EXEC); diff --git a/fs/namespace.c b/fs/namespace.c index 437f60e96d40..5a51315c6678 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4472,10 +4472,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) /* * If this is an attached mount make sure it's located in the callers * mount namespace. If it's not don't let the caller interact with it. - * If this is a detached mount make sure it has an anonymous mount - * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE. + * + * If this mount doesn't have a parent it's most often simply a + * detached mount with an anonymous mount namespace. IOW, something + * that's simply not attached yet. But there are apparently also users + * that do change mount properties on the rootfs itself. That obviously + * neither has a parent nor is it a detached mount so we cannot + * unconditionally check for detached mounts. */ - if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns))) + if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) goto out; /* diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index a3059b3168fd..9a0d32e4b422 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -477,6 +477,9 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + if (!iov_iter_count(from)) + return 0; + if ((iocb->ki_flags & IOCB_DIRECT) || test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) return netfs_unbuffered_write_iter(iocb, from); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 60a40d293c87..bee047e20f5d 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -139,6 +139,9 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + if (!iov_iter_count(from)) + return 0; + trace_netfs_write_iter(iocb, from); netfs_stat(&netfs_n_rh_dio_write); @@ -146,7 +149,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret < 0) return ret; ret = generic_write_checks(iocb, from); - if (ret < 0) + if (ret <= 0) goto out; ret = file_remove_privs(file); if (ret < 0) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index e8ff1e61ce79..4261ad6c55b6 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -748,6 +748,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) if (!rreq->submitted) { netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_end(rreq->inode); ret = 0; goto out; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 44eca51b2808..fbdc9ca80f71 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp) put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); - kfree(clp); + kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void) } EXPORT_SYMBOL_GPL(nfs_alloc_server); +static void delayed_free(struct rcu_head *p) +{ + struct nfs_server *server = container_of(p, struct nfs_server, rcu); + + nfs_free_iostats(server->io_stats); + kfree(server); +} + /* * Free up a server record */ @@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); - nfs_free_iostats(server->io_stats); put_cred(server->cred); - kfree(server); nfs_release_automount_timer(); + call_rcu(&server->rcu, delayed_free); } EXPORT_SYMBOL_GPL(nfs_free_server); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c8ecbe999059..ac505671efbd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry) static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) { struct inode *inode = d_inode(dentry); - struct inode *dir = d_inode(dentry->d_parent); + struct inode *dir = d_inode_rcu(dentry->d_parent); - if (!nfs_verify_change_attribute(dir, verf)) + if (!dir || !nfs_verify_change_attribute(dir, verf)) return; if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) nfs_set_verifier_delegated(&verf); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b8e25ca51016..8586e2f5d243 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -265,20 +265,18 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, if (IS_ERR(old_file)) return PTR_ERR(old_file); + /* Try to use clone_file_range to clone up within the same fs */ + cloned = vfs_clone_file_range(old_file, 0, new_file, 0, len, 0); + if (cloned == len) + goto out_fput; + + /* Couldn't clone, so now we try to copy the data */ error = rw_verify_area(READ, old_file, &old_pos, len); if (!error) error = rw_verify_area(WRITE, new_file, &new_pos, len); if (error) goto out_fput; - /* Try to use clone_file_range to clone up within the same fs */ - ovl_start_write(dentry); - cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); - ovl_end_write(dentry); - if (cloned == len) - goto out_fput; - /* Couldn't clone, so now we try to copy the data */ - /* Check if lower fs supports seek operation */ if (old_file->f_mode & FMODE_LSEEK) skip_hole = true; diff --git a/fs/proc/base.c b/fs/proc/base.c index 98a031ac2648..18550c071d71 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1878,8 +1878,6 @@ void proc_pid_evict_inode(struct proc_inode *ei) hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } - - put_pid(pid); } struct inode *proc_pid_make_inode(struct super_block *sb, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b33e490e3fd9..05350f3c2812 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -30,7 +30,6 @@ static void proc_evict_inode(struct inode *inode) { - struct proc_dir_entry *de; struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); @@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode) clear_inode(inode); /* Stop tracking associated processes */ - if (ei->pid) { + if (ei->pid) proc_pid_evict_inode(ei); - ei->pid = NULL; - } - - /* Let go of any associated proc directory entry */ - de = ei->pde; - if (de) { - pde_put(de); - ei->pde = NULL; - } head = ei->sysctl; if (head) { @@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb) static void proc_free_inode(struct inode *inode) { + struct proc_inode *ei = PROC_I(inode); + + if (ei->pid) + put_pid(ei->pid); + /* Let go of any associated proc directory entry */ + if (ei->pde) + pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } diff --git a/fs/proc/root.c b/fs/proc/root.c index b55dbc70287b..06a297a27ba3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); put_pid_ns(fs_info->pid_ns); - kfree(fs_info); + kfree_rcu(fs_info, rcu); } static struct file_system_type proc_fs_type = { diff --git a/fs/remap_range.c b/fs/remap_range.c index f8c1120b8311..de07f978ce3e 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -373,9 +373,9 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_remap_file_range_prep); -loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { loff_t ret; @@ -391,23 +391,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, remap_flags); - if (ret < 0) - return ret; - - fsnotify_access(file_in); - fsnotify_modify(file_out); - return ret; -} -EXPORT_SYMBOL(do_clone_file_range); - -loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; @@ -417,10 +400,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, return ret; file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, - remap_flags); + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); file_end_write(file_out); + if (ret < 0) + return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 1daeb5714faa..3de5047a7ff9 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -242,6 +242,7 @@ replay_again: .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES, .disposition = FILE_OPEN, .fid = pfid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 2a4a4e3a8751..0c269396ae15 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1172,6 +1172,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode, { char *target_path; + if (!dentry) + return ERR_PTR(-ECHILD); + target_path = kmalloc(PATH_MAX, GFP_KERNEL); if (!target_path) return ERR_PTR(-ENOMEM); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index c86a72c9d9ec..53c75cfb33ab 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1378,6 +1378,7 @@ struct cifs_open_parms { struct cifs_fid *fid; umode_t mode; bool reconnect:1; + bool replay:1; /* indicates that this open is for a replay */ }; struct cifs_fid { diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index d03253f8f145..ac9595504f4b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3444,8 +3444,18 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) * the user on mount */ if ((cifs_sb->ctx->wsize == 0) || - (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) - cifs_sb->ctx->wsize = server->ops->negotiate_wsize(tcon, ctx); + (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) { + cifs_sb->ctx->wsize = + round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE); + /* + * in the very unlikely event that the server sent a max write size under PAGE_SIZE, + * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096 + */ + if (cifs_sb->ctx->wsize == 0) { + cifs_sb->ctx->wsize = PAGE_SIZE; + cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n"); + } + } if ((cifs_sb->ctx->rsize == 0) || (cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx))) cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx); diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index aec8dbd1f9db..4b2f5aa2ea0e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1111,6 +1111,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_wsize: ctx->wsize = result.uint_32; ctx->got_wsize = true; + if (ctx->wsize % PAGE_SIZE != 0) { + ctx->wsize = round_down(ctx->wsize, PAGE_SIZE); + if (ctx->wsize == 0) { + ctx->wsize = PAGE_SIZE; + cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE); + } else { + cifs_dbg(VFS, + "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n", + ctx->wsize, PAGE_SIZE); + } + } break; case Opt_acregmax: ctx->acregmax = HZ * result.uint_32; diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index a6968573b775..4a517b280f2b 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page) return s; } +static void fs_context_set_ids(struct smb3_fs_context *ctx) +{ + kuid_t uid = current_fsuid(); + kgid_t gid = current_fsgid(); + + if (ctx->multiuser) { + if (!ctx->uid_specified) + ctx->linux_uid = uid; + if (!ctx->gid_specified) + ctx->linux_gid = gid; + } + if (!ctx->cruid_specified) + ctx->cred_uid = uid; +} + /* * Create a vfsmount that we can automount */ @@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path) tmp.leaf_fullpath = NULL; tmp.UNC = tmp.prepath = NULL; tmp.dfs_root_ses = NULL; + fs_context_set_ids(&tmp); rc = smb3_fs_context_dup(ctx, &tmp); if (rc) { diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 83c898afc835..4695433fcf39 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -619,7 +619,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, goto out; } - while (bytes_left >= sizeof(*p)) { + while (bytes_left >= (ssize_t)sizeof(*p)) { memset(&tmp_iface, 0, sizeof(tmp_iface)); tmp_iface.speed = le64_to_cpu(p->LinkSpeed); tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; @@ -1204,6 +1204,7 @@ replay_again: .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, 0), .fid = &fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -1569,6 +1570,7 @@ replay_again: .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, create_options), .fid = &fid, + .replay = !!(retries), }; if (qi.flags & PASSTHRU_FSCTL) { @@ -2295,6 +2297,7 @@ replay_again: .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, 0), .fid = fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -2681,6 +2684,7 @@ replay_again: .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, 0), .fid = &fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -5213,7 +5217,7 @@ static int smb2_create_reparse_symlink(const unsigned int xid, struct inode *new; struct kvec iov; __le16 *path; - char *sym; + char *sym, sep = CIFS_DIR_SEP(cifs_sb); u16 len, plen; int rc = 0; @@ -5227,7 +5231,8 @@ static int smb2_create_reparse_symlink(const unsigned int xid, .symlink_target = sym, }; - path = cifs_convert_path_to_utf16(symname, cifs_sb); + convert_delimiter(sym, sep); + path = cifs_convert_path_to_utf16(sym, cifs_sb); if (!path) { rc = -ENOMEM; goto out; @@ -5250,7 +5255,10 @@ static int smb2_create_reparse_symlink(const unsigned int xid, buf->PrintNameLength = cpu_to_le16(plen); memcpy(buf->PathBuffer, path, plen); buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0); + if (*sym != sep) + buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE); + convert_delimiter(sym, '/'); iov.iov_base = buf; iov.iov_len = len; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 4085ce27fd38..608ee05491e2 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2404,8 +2404,13 @@ create_durable_v2_buf(struct cifs_open_parms *oparms) */ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); - generate_random_uuid(buf->dcontext.CreateGuid); - memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); + + /* for replay, we should not overwrite the existing create guid */ + if (!oparms->replay) { + generate_random_uuid(buf->dcontext.CreateGuid); + memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); + } else + memcpy(buf->dcontext.CreateGuid, pfid->create_guid, 16); /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ buf->Name[0] = 'D'; @@ -3142,6 +3147,7 @@ replay_again: /* reinitialize for possible replay */ flags = 0; server = cifs_pick_channel(ses); + oparms->replay = !!(retries); cifs_dbg(FYI, "create/open\n"); if (!ses || !server) diff --git a/fs/super.c b/fs/super.c index d35e85295489..d6efeba0d0ce 100644 --- a/fs/super.c +++ b/fs/super.c @@ -274,9 +274,10 @@ static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); - int i; - - for (i = 0; i < SB_FREEZE_LEVELS; i++) + security_sb_free(s); + put_user_ns(s->s_user_ns); + kfree(s->s_subtype); + for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } @@ -296,9 +297,6 @@ static void destroy_unused_super(struct super_block *s) super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - security_sb_free(s); - put_user_ns(s->s_user_ns); - kfree(s->s_subtype); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); @@ -409,9 +407,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); - security_sb_free(s); - put_user_ns(s->s_user_ns); - kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); } } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 6ab2318a9c8e..dba5dcb62bef 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -348,7 +348,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, struct zonefs_inode_info *zi = ZONEFS_I(inode); if (error) { - zonefs_io_error(inode, true); + /* + * For Sync IOs, error recovery is called from + * zonefs_file_dio_write(). + */ + if (!is_sync_kiocb(iocb)) + zonefs_io_error(inode, true); return error; } @@ -491,6 +496,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = -EINVAL; goto inode_unlock; } + /* + * Advance the zone write pointer offset. This assumes that the + * IO will succeed, which is OK to do because we do not allow + * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO + * fails, the error path will correct the write pointer offset. + */ + z->z_wpoffset += count; + zonefs_inode_account_active(inode); mutex_unlock(&zi->i_truncate_mutex); } @@ -504,20 +517,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) if (ret == -ENOTBLK) ret = -EBUSY; - if (zonefs_zone_is_seq(z) && - (ret > 0 || ret == -EIOCBQUEUED)) { - if (ret > 0) - count = ret; - - /* - * Update the zone write pointer offset assuming the write - * operation succeeded. If it did not, the error recovery path - * will correct it. Also do active seq file accounting. - */ - mutex_lock(&zi->i_truncate_mutex); - z->z_wpoffset += count; - zonefs_inode_account_active(inode); - mutex_unlock(&zi->i_truncate_mutex); + /* + * For a failed IO or partial completion, trigger error recovery + * to update the zone write pointer offset to a correct value. + * For asynchronous IOs, zonefs_file_write_dio_end_io() may already + * have executed error recovery if the IO already completed when we + * reach here. However, we cannot know that and execute error recovery + * again (that will not change anything). + */ + if (zonefs_zone_is_seq(z)) { + if (ret > 0 && ret != count) + ret = -EIO; + if (ret < 0 && ret != -EIOCBQUEUED) + zonefs_io_error(inode, true); } inode_unlock: diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 93971742613a..b6e8e7c96251 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode) z->z_mode = inode->i_mode; } -struct zonefs_ioerr_data { - struct inode *inode; - bool write; -}; - static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, void *data) { - struct zonefs_ioerr_data *err = data; - struct inode *inode = err->inode; + struct blk_zone *z = data; + + *z = *zone; + return 0; +} + +static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, + bool write) +{ struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); @@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, data_size = zonefs_check_zone_condition(sb, z, zone); isize = i_size_read(inode); if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && - !err->write && isize == data_size) - return 0; + !write && isize == data_size) + return; /* * At this point, we detected either a bad zone or an inconsistency @@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * In all cases, warn about inode size inconsistency and handle the * IO error according to the zone condition and to the mount options. */ - if (zonefs_zone_is_seq(z) && isize != data_size) + if (isize != data_size) zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); @@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, zonefs_i_size_write(inode, data_size); z->z_wpoffset = data_size; zonefs_inode_account_active(inode); - - return 0; } /* @@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; - struct zonefs_sb_info *sbi = ZONEFS_SB(sb); unsigned int noio_flag; - unsigned int nr_zones = 1; - struct zonefs_ioerr_data err = { - .inode = inode, - .write = write, - }; + struct blk_zone zone; int ret; /* - * The only files that have more than one zone are conventional zone - * files with aggregated conventional zones, for which the inode zone - * size is always larger than the device zone size. + * Conventional zone have no write pointer and cannot become read-only + * or offline. So simply fake a report for a single or aggregated zone + * and let zonefs_handle_io_error() correct the zone inode information + * according to the mount options. */ - if (z->z_size > bdev_zone_sectors(sb->s_bdev)) - nr_zones = z->z_size >> - (sbi->s_zone_sectors_shift + SECTOR_SHIFT); + if (!zonefs_zone_is_seq(z)) { + zone.start = z->z_sector; + zone.len = z->z_size >> SECTOR_SHIFT; + zone.wp = zone.start + zone.len; + zone.type = BLK_ZONE_TYPE_CONVENTIONAL; + zone.cond = BLK_ZONE_COND_NOT_WP; + zone.capacity = zone.len; + goto handle_io_error; + } /* * Memory allocations in blkdev_report_zones() can trigger a memory @@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write) * the GFP_NOIO context avoids both problems. */ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones, - zonefs_io_error_cb, &err); - if (ret != nr_zones) + ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1, + zonefs_io_error_cb, &zone); + memalloc_noio_restore(noio_flag); + + if (ret != 1) { zonefs_err(sb, "Get inode %lu zone information failed %d\n", inode->i_ino, ret); - memalloc_noio_restore(noio_flag); + zonefs_warn(sb, "remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + return; + } + +handle_io_error: + zonefs_handle_io_error(inode, &zone, write); } static struct kmem_cache *zonefs_inode_cachep; |